# Query News Articles with Named Entities

## Prepare the data and import common libs

In [1]:
from tqdm import tqdm
from datasets import load_dataset

# Load the "3.0.0" configuration of the cnn_dailymail dataset through the
# Hugging Face `datasets` library.
dataset = load_dataset("cnn_dailymail", "3.0.0")

Found cached dataset cnn_dailymail (/Users/timowang/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# For this activity, we will use the test articles only.
# `articles` holds the "article" column of the test split; an article's
# position in it is the index used by the entity indexes built below.
articles = dataset["test"]["article"]
print("len(articles)", len(articles))

len(articles) 11490


## Apply NER to index articles with their named entities

In [3]:
import spacy

# Load the `en_core_web_sm` spaCy pipeline. It is applied to every article
# below, and the named entities are read from the resulting `doc.ents`.
nlp = spacy.load("en_core_web_sm")

2022-10-27 19:16:14.948386: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
# TODO: Define a function that maps entities to the indices of the articles that mention them
#       and, in reverse, maps article indices to the entities they contain.
#       The function should only consider a list of selected named entity labels.


def link_entity_and_article_indices(articles, named_entities=None):
    """Build forward and reverse indexes between named entities and articles.

    Every article is processed with the module-level spaCy pipeline ``nlp``
    and each entity in ``doc.ents`` whose label is selected is recorded.

    Args:
        articles: Iterable of article texts. The position of an article in
            the iterable is used as its article index.
        named_entities: Entity labels to keep, e.g. ``["GPE", "ORG"]``.
            ``None`` or an empty collection keeps every label. A single
            label may also be given as a plain string.

    Returns:
        A tuple ``(label2ent2article_idxes, article_idx2label2ents)`` where

        * ``label2ent2article_idxes`` maps label -> entity text -> list of
          article indices, in ascending order with one entry per article;
        * ``article_idx2label2ents`` maps article index -> label -> set of
          entity texts. Every article gets an entry, even when no selected
          entity was found in it.
    """
    # Avoid a mutable default argument and make label membership O(1).
    if isinstance(named_entities, str):
        named_entities = [named_entities]
    wanted_labels = frozenset(named_entities or ())

    label2ent2article_idxes = {}
    article_idx2label2ents = {}

    # Wrap the articles themselves (not the enumerate object) so tqdm can
    # read their length and display a real progress bar with an ETA.
    for idx, article in enumerate(tqdm(articles)):
        doc = nlp(article)
        label2ents = article_idx2label2ents[idx] = {}
        for ent in doc.ents:
            label, text = ent.label_, ent.text
            if wanted_labels and label not in wanted_labels:
                continue
            seen_in_article = label2ents.setdefault(label, set())
            if text in seen_in_article:
                # Repeated mention in the same article: the article index is
                # already recorded for this entity, so do not add it again.
                continue
            seen_in_article.add(text)
            ent2article_idxes = label2ent2article_idxes.setdefault(label, {})
            ent2article_idxes.setdefault(text, []).append(idx)

    return label2ent2article_idxes, article_idx2label2ents

In [18]:
# Build both indexing dictionaries over all articles, keeping only entities
# labelled GPE, ORG or PERSON.
label2ent2article_idxes, article_idx2label2ents = link_entity_and_article_indices(
    articles,
    named_entities=["GPE", "ORG", "PERSON"],
)

11490it [14:18, 13.39it/s]


## Query for articles that contain specified entities

In [24]:
# TODO: Implement the function that takes label2ent2article_idxes and a dictionary
#       that maps entity labels to entity values (label2ents) and returns the set of article
#       indices that contain the specified entities.

def query_articles_with_entities(label2ent2article_idxes, label2ents):
    """Return the indices of the articles that mention all requested entities.

    Args:
        label2ent2article_idxes: Mapping label -> entity text -> iterable of
            article indices.
        label2ents: Mapping label -> iterable of entity texts to look for.

    Returns:
        A set with the indices of the articles that contain every requested
        entity known to the index. Labels or entities that do not appear in
        the index are ignored. The set is empty when the query is empty,
        when none of the requested entities is known, or when no article
        contains all of them.
    """
    # ``None`` means "no known entity processed yet". It must stay distinct
    # from an empty set, which means "the intersection is already empty";
    # otherwise a later entity would wrongly restart the result.
    matches = None

    for label, ents in label2ents.items():
        ent2article_idxes = label2ent2article_idxes.get(label, {})
        for ent in ents:
            if ent not in ent2article_idxes:
                continue
            if matches is None:
                matches = set(ent2article_idxes[ent])
            else:
                matches.intersection_update(ent2article_idxes[ent])

    return set() if matches is None else matches

In [27]:
# Look up the articles that mention both Chicago (GPE) and Barack Obama (PERSON),
# then report how many were found.
article_idxes = query_articles_with_entities(
    label2ent2article_idxes,
    {"GPE": ["Chicago"], "PERSON": ["Barack Obama"]},
)
print(len(article_idxes))

7


## Display entities contained in an article

In [28]:
# TODO: Implement the function that takes a list of articles, article_idx2label2ents and
#       an article index as input and prints out the article content as well as
#       the related entities.

def print_article_content(articles, article_idx2label2ents, article_idx):
    """Print one article followed by its entities grouped by label.

    The article text is printed first. Then, for each label recorded for
    the article, the label is printed on its own line and each of its
    entities on a tab-indented line below it.

    Args:
        articles: Indexable collection of article texts.
        article_idx2label2ents: Mapping article index -> label -> iterable
            of entity texts.
        article_idx: Index of the article to display.
    """
    print(articles[article_idx])

    entities_by_label = article_idx2label2ents[article_idx]
    for label, entities in entities_by_label.items():
        print("%s" % label)
        for entity in entities:
            print("\t%s" % entity)
            

In [29]:
# Print every article returned by the query, each followed by its entities.
for article_idx in article_idxes:
    print_article_content(articles, article_idx2label2ents, article_idx)

(CNN)Jacob Lawrence's groundbreaking work, "The Migration of the Negro," is a series of 60 small paintings with text depicting the lives of millions of pre-World War II blacks as they moved from the oppressive South to the promise of a better life in the North. One frame in particular, No. 22, shows three black men dressed in their Sunday best, handcuffed, heads bowed standing in front of a window adorned with prison bars. The caption reads: . "Another of the social causes of the migrants' leaving was that at times they did not feel safe, or it was not the best thing to be found on the streets late at night. They were arrested on the slightest provocation." Lawrence painted this picture around 1940. This is important to know because there are some people who blame President Barack Obama for racial tension in America as if the March on Washington scrubbed away the lasting effects of 300 years of inequality. No. 22 was painted before hip-hop became the piñata for conservative talk show h