In [1]:
import pandas as pd
import numpy as np
import random
import spacy
from tqdm import tqdm

random.seed(93)
np.random.seed(93)
spacy.require_gpu()
tqdm.pandas()

In [2]:
coref_data = pd.read_parquet("../data/corref-2023-4.parquet")
coref_data.head()

Unnamed: 0,source,id,category,title,published,body,summary,summary_type
0,reuters,43869,Asian Markets,"Taiwan seen slipping into recession in Q1, Reu...",2023-04-26T04:54:00,"TAIPEI, April 26 (Reuters) - Taiwan's export-d...",* \n* For poll data click:\n* Preliminary Q1 G...,BULLETS
1,reuters,43881,Retail & Consumer,Corona beer maker Constellation sees 2024 prof...,2023-04-06T12:59:00,April 6 (Reuters) - Constellation Brands Inc (...,,
2,reuters,43904,Mergers & AcquisitionsMergers & AcquisitionsDr...,"BioNTech, DualityBio to develop cancer treatme...",2023-04-03T21:28:00,April 3 (Reuters) - Germany's BioNTech (22UAy....,,
3,reuters,43912,CommentaryBy Rebecca ChristieBreakingviews,New EU debt rules have way to avoid past mistakes,2023-04-04T10:32:00,"BRUSSELS, April 4 (Reuters Breakingviews) - Th...",,
4,reuters,43916,CommentaryBy Rebecca ChristieBreakingviews,Rome foot-dragging can help EU kick bad aid ha...,2023-04-18T09:52:00,"BRUSSELS, April 18 (Reuters Breakingviews) - I...",,


In [3]:
coref_data.loc[coref_data.source == "reuters"].head()

Unnamed: 0,source,id,category,title,published,body,summary,summary_type
0,reuters,43869,Asian Markets,"Taiwan seen slipping into recession in Q1, Reu...",2023-04-26T04:54:00,"TAIPEI, April 26 (Reuters) - Taiwan's export-d...",* \n* For poll data click:\n* Preliminary Q1 G...,BULLETS
1,reuters,43881,Retail & Consumer,Corona beer maker Constellation sees 2024 prof...,2023-04-06T12:59:00,April 6 (Reuters) - Constellation Brands Inc (...,,
2,reuters,43904,Mergers & AcquisitionsMergers & AcquisitionsDr...,"BioNTech, DualityBio to develop cancer treatme...",2023-04-03T21:28:00,April 3 (Reuters) - Germany's BioNTech (22UAy....,,
3,reuters,43912,CommentaryBy Rebecca ChristieBreakingviews,New EU debt rules have way to avoid past mistakes,2023-04-04T10:32:00,"BRUSSELS, April 4 (Reuters Breakingviews) - Th...",,
4,reuters,43916,CommentaryBy Rebecca ChristieBreakingviews,Rome foot-dragging can help EU kick bad aid ha...,2023-04-18T09:52:00,"BRUSSELS, April 18 (Reuters Breakingviews) - I...",,


In [4]:
coref_data.loc[coref_data.source == "cnbc"].head()

Unnamed: 0,source,id,category,title,published,body,summary,summary_type
43,cnbc,4419,Economy,Fed Governor Bowman casts doubt on the need fo...,2023-04-18T17:01:56+00:00,Federal Reserve Governor Michelle Bowman expre...,* Federal Reserve Governor Michelle Bowman exp...,BULLETS
44,cnbc,4420,Economy,Layoffs are up nearly fivefold so far this yea...,2023-04-06T12:11:07+00:00,"Google headquarters in Mountain View, Californ...","* Planned layoffs totaled 89,703 for the perio...",BULLETS
46,cnbc,4421,Economy,"Job growth totals 236,000 in March, near expec...",2023-04-07T12:32:36+00:00,"Job growth totals 236,000 in March, near expec...","* Nonfarm payrolls grew by 236,000 for March, ...",BULLETS
47,cnbc,4422,Economy,Key inflation gauge for the Fed rose 0.3% in M...,2023-04-28T12:34:13+00:00,Key inflation gauge for the Fed rose 0.3% in M...,,
49,cnbc,4423,Economy,Inflation rises just 0.1% in March and 5% from...,2023-04-12T12:31:43+00:00,"Inflation rises 0.1% in March, less than expec...",* The consumer price index rose 0.1% in March ...,BULLETS


In [5]:
coref_data.loc[coref_data.source == "nyt"].head()

Unnamed: 0,source,id,category,title,published,body,summary,summary_type
13716,nyt,113630,Well,Why Oral Hygiene Is Crucial to Your Overall He...,2023-04-06T10:36:14-04:00,b'The inside of your mouth is the perfect plac...,Gum disease has been associated with a range o...,PLAIN
13717,nyt,113631,En español,¿Cuánto les importa a los votantes la edad de ...,2023-04-28T07:30:06-04:00,b'Muchos estadounidenses dicen que no quieren ...,Más allá de una crisis de salud o una equivoca...,PLAIN
13718,nyt,113645,Real Estate,How to Find the Right Broker for Selling Your ...,2023-04-08T10:00:05-04:00,"b'Q: I am planning to sell my condo in Harlem,...",New York home sellers spend an average of thre...,PLAIN
13719,nyt,113648,Real Estate,My Landlord Lives Below Me and Hates Noise. Wh...,2023-04-15T05:00:10-04:00,b'Q: My roommate and I live on the third floor...,Establishing whether noise is legally excessiv...,PLAIN
13720,nyt,113650,Real Estate,The Ice Cream Truck Across the Street Is Makin...,2023-04-29T05:00:12-04:00,b'Q: We live in a co-op building opposite a sm...,"Truck operators, like all food vendors in the ...",PLAIN


In [6]:
sentence_splitter = spacy.load("en_core_web_md", enable=["senter"], config={"nlp": {"disabled": []}})
ner_recog = spacy.load("en_core_web_md", enable=["ner"])


def strip_reuter_intro(text):
    text_chunks = text.split(" - ")
    return " - ".join(text_chunks[1:]).strip()


def extract_entities(text):
    sentences = [s.text for s in sentence_splitter(text).sents]
    ner_results = ner_recog.pipe(sentences)
    entities = []
    for doc in ner_results:
        for ent in doc.ents:
            entities.append((ent.label_, ent.text))
    return entities

In [7]:
coref_data["body"] = coref_data.apply(lambda row: strip_reuter_intro(row["body"] if row["source"] == "reuters" else row["body"]), axis=1)
coref_data["entities"] = coref_data.body.progress_apply(extract_entities)
coref_data_ents = coref_data[["id", "entities"]]
coref_data_ents.head()

100%|██████████| 14646/14646 [02:25<00:00, 100.69it/s]


Unnamed: 0,id,entities
0,43869,"[(GPE, Taiwan), (DATE, the first quarter), (OR..."
1,43881,"[(ORG, Constellation Brands Inc), (LOC, STZ.N)..."
2,43904,"[(GPE, Germany), (CARDINAL, 22UAy), (ORG, DE),..."
3,43912,"[(ORG, The European Union’s), (CARDINAL, two),..."
4,43916,"[(GPE, Italy), (GPE, BRUSSELS), (GPE, BRUSSELS..."


In [11]:
coref_data_exploded = coref_data_ents.explode("entities").reset_index(drop=True)
coref_data_exploded[["entity_type", "entity_name"]] = pd.DataFrame(coref_data_exploded.entities.tolist())
coref_data_exploded = coref_data_exploded[["entity_type", "entity_name"]]
coref_data_exploded.head()

Unnamed: 0,entity_type,entity_name
0,GPE,Taiwan
1,DATE,the first quarter
2,ORG,Reuters
3,DATE,Wednesday
4,PERCENT,1.25%


In [12]:
coref_ner_count = coref_data_exploded.groupby("entity_type").count()
coref_ner_count

Unnamed: 0_level_0,entity_name
entity_type,Unnamed: 1_level_1
CARDINAL,56641
DATE,149063
EVENT,2402
FAC,2471
GPE,181694
LANGUAGE,274
LAW,1694
LOC,16306
MONEY,31823
NORP,67576


In [25]:
coref_data_exploded.loc[coref_data_exploded.entity_type == "ORG"]

Unnamed: 0,entity_type,entity_name
2,ORG,Reuters
7,ORG,Reuters
18,ORG,DBS
25,ORG,Apple Inc
26,ORG,Taiwan Semiconductor Manufacturing Co Ltd
...,...,...
978877,ORG,FOXA.O
978880,ORG,Fox Corp
978881,ORG,FOXA.O
978883,ORG,Exclusive -- Fox Corp
