In [1]:
from allennlp.predictors.predictor import Predictor
from collections import namedtuple
import spacy
import math
import json
import tqdm

Item = namedtuple('INFO', ('title', 'section', 'pdate', 'weburl', 'apiurl', 'text', 'ents', 'sentiment'))
Result = namedtuple('Result', ('sid', 'aid', 'pid', 'sent', 'para'))

### allen nlp sentiment model

In [3]:
sentiment_model = "sst-roberta-large-2020.02.17.tar.gz"

In [4]:
sent = "Key Trump adviser claims tax cuts will still pay for themselves even as coronavirus drags on US economy"

In [5]:
sentiment_predictor = Predictor.from_path(sentiment_model)

In [6]:
sentiment_predictor.predict(sentence=sent)

{'logits': [-0.8061320781707764, -0.23963770270347595],
 'probs': [0.36204612255096436, 0.6379538774490356],
 'label': '0'}

In [7]:
def get_sentiment(text):
    pred = sentiment_predictor.predict(sentence=text)
    label = int(pred['label'])
    return -1 if label == 0 else 1

### Spacy ner model

In [9]:
nlp = spacy.load('en_core_web_sm')

In [12]:
def get_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ('ORG', 'GPE', 'NORP', 'FAC')]

def get_text_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ('ORG', 'GPE', 'NORP', 'FAC', 
                                                                         'PERSON', 'PRODUCT', 'MONEY', 'EVENT')]

### extract entity and sentiment information from title

In [13]:
def extract_info(text):
    entities = get_entities(text)
    sentiment = get_sentiment(text)
    return entities,sentiment

def analyse(news_list):
    news = []
    for title, section, date, weburl, apiurl,bodytext in tqdm.tqdm(news_list):
        entities, sentiment = extract_info(title)
        news.append(Item(title=title,
                         section=section,
                         pdate=date,
                         weburl=weburl,
                         apiurl=apiurl,
                         text=bodytext,
                         ents=entities, 
                         sentiment=sentiment,))
    return news

In [13]:
# load dataset that doesnt have NER and sentiment labels
with open('data/news.json', 'r') as f:
    news_list = json.load(f)

In [14]:
analysis = analyse(news_list)

100%|██████████| 4435/4435 [31:39<00:00,  2.34it/s]    


In [15]:
# with open('data/news-text.json', 'w') as f:
#     json.dump(analysis, fp=f)

In [16]:
get_entities(analysis[300].text)

[('America', 'GPE'),
 ('Dow Jones', 'ORG'),
 ('Bear Market', 'ORG'),
 ('American', 'NORP'),
 ('the Federal Government', 'ORG'),
 ('CoronaVirus', 'ORG'),
 ('UK', 'GPE'),
 ('FTSE', 'ORG'),
 ('The Bank of England', 'ORG'),
 ('UK', 'GPE'),
 ('UK', 'GPE'),
 ('the European Central Bank', 'ORG'),
 ('EU', 'ORG'),
 ('Italy', 'GPE')]

In [18]:
analysis[10].text



### Fasttext embedding

In [26]:
from gensim.models.fasttext import FastText
import nltk

def process_sent(sent, remove_punct=True):
    if remove_punct:
        tokens = [token.lower() for token in nltk.word_tokenize(sent.strip()) if len(token)>1 or token.isalnum()]
    else:
        tokens = [token.lower() for token in nltk.word_tokenize(sent.strip())]
    return tokens
    
def flat_corpus(dataset, para_level=False, remove_punct=True):
    corpus = []
    sid2art = {}
    for aid, an in enumerate(tqdm.tqdm(dataset)):
        title = an.title
        paragraphs = [title] + an.text.split("\n")
        if para_level:
            for pid, para in enumerate(paragraphs):
                start_idx = len(sid2art)
                sid2art[start_idx] = {'aid':aid, 'pid':pid}
                tokens = process_sent(para, remove_punct)
                corpus.append(tokens)           
        else:
            for pid, para in enumerate(paragraphs):
                sents = nltk.sent_tokenize(para)
                start_idx = len(sid2art)
                for i in range(len(sents)):
                    sid2art[start_idx+i] = {'aid':aid, 'pid':pid}
                    tokens = process_sent(sents[i], remove_punct)
                    corpus.append(tokens)
    return corpus, sid2art

In [27]:
# laod dataset
with open('data/news-text-unique.json', 'r') as f:
    data = json.load(f)
    analysis = [Item._make(d) for d in data]

In [28]:
line_corpus, sid2aid = flat_corpus(analysis, para_level=False, remove_punct=True)

100%|██████████| 3171/3171 [00:28<00:00, 109.64it/s]


In [29]:
ft_model = FastText(
    sentences=line_corpus, 
    size=100,
    window=5,
    min_count=5,
    word_ngrams=1,
    iter=5,
    negative=5,
    sample=1e-3,
    sg=0,
    alpha=0.025,
    ns_exponent=0.75,
    min_n=3,
    max_n=6)

In [65]:
ft_model.wv.similar_by_word('covid')

[('covid‑19', 0.9635025262832642),
 ('covid-19.', 0.9187116622924805),
 ('covid-19', 0.8985052108764648),
 ('covid-19-related', 0.8692864775657654),
 ('anti-coronavirus', 0.8612541556358337),
 ('coronavirus-driven', 0.8524642586708069),
 ('coronavirus', 0.8481267690658569),
 ('coronavirus-induced', 0.8410760164260864),
 ('coronavirus.', 0.8392269611358643),
 ('coronaviruses', 0.8391625881195068)]

### retrieve documents by sentence embedding

In [31]:
from fse.models import SIF
from fse import IndexedList
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import DistanceMetric
from sklearn import metrics
import numpy as np

In [51]:
def train_fse(fasttext_model, flat_corpus):
    model = SIF(fasttext_model)
    indexed_corpus = IndexedList(flat_corpus)
    model.train(indexed_corpus)
    return model, indexed_corpus

def retrieve_paragraph(mappings, dataset):
    retrieved_paras = []
    for m in mappings:
        article = dataset[m['aid']]
        paras = [article.title] + article.text.split('\n')
        paragraph = paras[m['pid']]
        retrieved_paras.append(paragraph)
    return retrieved_paras

def train_tfidf_vectorizer(dataset):
    tfidf_vectorizer = TfidfVectorizer(binary=True, sublinear_tf=True, norm='l2', ngram_range=(1,2))
    para_corpus, pid2aid = flat_corpus(dataset, para_level=True, remove_punct=True)
    tfidf_vectorizer.fit([' '.join(line) for line in para_corpus])
    return tfidf_vectorizer
    
def rerank(query, results, vectorizer):
    trans_para = vectorizer.transform([query]+[' '.join(process_sent(r.para)) for r in results])
    dense = trans_para.todense()
    sims = metrics.pairwise.cosine_similarity(dense[0], dense[1:])
    sorted_rank = np.argsort(-sims[0]).tolist()
    return [results[sr] for sr in sorted_rank]
    
def search(model, vectorizer, query, sent_corpus, dataset, sid_mapping):
    query = ' '.join(process_sent(query, remove_punct=True))
    sents = model.sv.similar_by_sentence(nltk.word_tokenize(query), model=model)
    sids = [s[0] for s in sents]
    mapping = [sid_mapping[sid] for sid in sids]
    paragraphs = retrieve_paragraph(mapping, dataset)
    
    results = []
    exist = set()
    for i in range(len(sids)):
        sid = sids[i]
        aid = mapping[i]['aid']
        pid = mapping[i]['pid']
        sent = sent_corpus[i]
        para = paragraphs[i]
        if aid not in exist:
            results.append(Result(sid=sid, aid=aid, pid=pid, sent=sent, para=para))
            exist.add(aid)
    return rerank(query, results, vectorizer)

In [33]:
fse_model, indexed_corpus = train_fse(ft_model, line_corpus)
tfidf_vectorizer = train_tfidf_vectorizer(dataset=analysis)

100%|██████████| 3171/3171 [00:22<00:00, 138.49it/s]


In [59]:
query = "human rights coronavirus"
search_results = search(model=fse_model, vectorizer=tfidf_vectorizer, 
                               query=query, sid_mapping=sid2aid,
                               sent_corpus=line_corpus, dataset=analysis)

In [60]:
search_results

[Result(sid=102364, aid=2859, pid=1, sent=['but', 'we', 'also', 'need', 'a', 'way', 'out', 'of', 'this', 'pandemic', 'as', 'fast', 'as', 'possible.', 'farrar', 'who', 'is', 'an', 'infectious', 'disease', 'researcher', 'as', 'well', 'as', 'director', 'of', 'the', 'wellcome', 'trust', 'said', 'drugs', 'vaccines', 'and', 'rapid', 'diagnostics', 'are', 'the', 'only', 'way', 'we', 'have', 'of', 'saving', 'lives', 'bringing', 'this', 'pandemic', 'to', 'an', 'end', 'and', 'preventing', 'it', 'reappearing'], para='When Leilani Farha touches down in a new city, the first thing the UN special rapporteur on the right to adequate housing does is look up. In Melbourne, Toronto, London and Dublin, the skies above are filled with cranes, Farha says, soaring across the skylineto construct new homes for their booming populations. Last week Farha arrived in Wellington on a fact-finding mission, lured by the headlines of a housing crisis, chronic homelessness, and motels bulging with desperate families f

In [61]:
[analysis[r.aid].title for r in search_results]

["'They allowed the perfect storm': UN expert damns New Zealand's housing crisis",
 'If China valued free speech, there would be no coronavirus crisis',
 "China's reaction to the coronavirus violates human rights | Frances Eve",
 'Hungary seeks to end legal recognition of trans people amid Covid-19 crisis',
 "Togo has long been mired in political crisis – and elections won't change that",
 'Samoa turns away eight of its own citizens over coronavirus fears',
 'Rebecca Long-Bailey pledges the environment would be central to Labour policy',
 'UN secretary general: recovery from the coronavirus crisis must lead to a better world | António Guterres',
 'Keir Starmer had no enemies. Can he keep it that way?',

In [58]:
[analysis[r.aid].title for r in search_results]

['Samoa turns away eight of its own citizens over coronavirus fears',
 "'If I get corona, I get corona': the Americans who wish they'd taken Covid-19 seriously",
 'Stock markets end wild week with late Wall Street rally – as it happened',
 'California housing bill’s failure comes amid fierce debate on how to solve crisis',
 'Despite what Trump says, my Chinese students are not the face of this virus | Kristen Brownell',
 'UK to ban mass gatherings in coronavirus U-turn',
 'Faced with the coronavirus, Boris Johnson must stop playing the invisible man | Andrew Rawnsley',
 'Tasmania effectively closes borders to mainland Australia in response to coronavirus']

In [55]:
[a.title for a in analysis[:100]]

['Wellcome Trust urges firms to donate £6bn for Covid-19 research',
 'The coronavirus pandemic threatens a crisis for human rights too | Afua Hirsch',
 'Council workers face increasing abuse amid coronavirus pandemic',
 'Cancer Research UK to cut funding for research by £44m',
 'Australian prime minister to ask global leaders to fund urgent research for coronavirus vaccine',
 "Australia's private health funds could reap windfall from coronavirus – report",
 'Businesses affected by coronavirus should have access to disaster relief funds, says Queensland',
 'Postgraduate students urge funders to extend grants over coronavirus',
 'The coronavirus pandemic: visualising the global crisis',
 'Carnival cruises seeks $6bn funding amid coronavirus fallout',
 'Climate monitoring and research could fall victim to coronavirus, scientists fear',
 'Some UK property funds ban withdrawals over coronavirus',
 'Morning mail: British PM in hospital, Ruby Princess investigation, health fund windfall',
 'E