In [1]:
import json
import os

import pandas as pd
import requests

In [2]:
# Load the articles CSV (path is relative to the notebook) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../data/articles.csv")

In [3]:
# Build one text field per article: the title, a newline, then the body.
# Column-wise concatenation avoids a Python-level iterrows() loop, which
# materialises a Series for every row. map(str) applies the same str(...)
# conversion as before, so missing values still render as "nan".
data["text"] = data["webTitle"].map(str) + "\n" + data["bodyContent"].map(str)

In [4]:
# Preview the first five rows, including the newly added "text" column.
data.head(n=5)

Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id,text
0,environment/2022/jun/30/former-australian-chie...,Environment,Former Australian chief scientist to head revi...,https://www.theguardian.com/environment/2022/j...,The former Australian chief scientist and seni...,2022-06-30 23:53:09+00:00,147919,Former Australian chief scientist to head revi...
1,world/2022/jul/01/we-were-too-lenient-on-pro-d...,World news,We were too lenient on pro-democracy politicia...,https://www.theguardian.com/world/2022/jul/01/...,A senior member of Hong Kong’s incoming admini...,2022-06-30 23:52:37+00:00,147920,We were too lenient on pro-democracy politicia...
2,australia-news/2022/jul/01/south-australian-li...,Australia news,South Australian Liberal leader and state MPs ...,https://www.theguardian.com/australia-news/202...,A week after Roe v Wade was overturned in the ...,2022-06-30 23:33:09+00:00,147921,South Australian Liberal leader and state MPs ...
3,australia-news/2022/jul/01/john-barilaros-deci...,Australia news,John Barilaro’s decision may stem some bleedin...,https://www.theguardian.com/australia-news/202...,John Barilaro didn’t want to continue being “a...,2022-06-30 23:23:38+00:00,147922,John Barilaro’s decision may stem some bleedin...
4,lifeandstyle/2022/jul/01/breastfeeding-cogniti...,Life and style,Breastfeeding improves cognitive ability for c...,https://www.theguardian.com/lifeandstyle/2022/...,Children of poorer mothers who breastfeed are ...,2022-06-30 23:01:03+00:00,147923,Breastfeeding improves cognitive ability for c...


# Information extraction pipeline with Diffbot

In [5]:
# Comma-separated response sections requested via the `fields` query parameter.
FIELDS = "entities,sentiment,facts"
# Host name of the NLP endpoint the requests are sent to.
HOST = "nl.diffbot.com"
# Read the API token from the DIFFBOT_TOKEN environment variable so the key
# never has to be pasted into (and saved with) the notebook. The original
# placeholder is kept as the fallback when the variable is unset or empty.
DIFF_TOKEN = os.environ.get("DIFFBOT_TOKEN") or "<<DIFFBOT API KEY>>"


def nlp_request(payload, timeout=300):
    """POST a payload to the NLP endpoint and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body; it is sent unchanged.
        timeout: Seconds `requests` may wait on the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The parsed JSON response on success, or None when the request fails
        (network error, non-2xx HTTP status, or a body that is not JSON).
        Failures are reported with print() rather than raised.
    """
    try:
        res = requests.post(
            f"https://{HOST}/v1/",
            # Let requests build and URL-encode the query string.
            params={"fields": FIELDS, "token": DIFF_TOKEN},
            json=payload,
            timeout=timeout,
        )
        if not res.ok:
            # Report status and reason only: the request URL contains the
            # API token and must not end up in the notebook output.
            print(f"Failed NLP request due to HTTP {res.status_code} {res.reason}")
            return None
        return res.json()
    except Exception as e:
        # Deliberately best-effort: callers receive None instead of an exception.
        print(f"Failed NLP request due to {e}")
        return None

In [6]:
# Number of articles sent to the NLP API per request.
batch_size = 50
# One dict per processed article: {"id", "entity", "fact", "sentiment"}.
nlp_results = list()


def _parse_entities(row):
    """Flatten the typed entities of one NLP response row into plain dicts."""
    return [
        {
            "name": x["name"],
            "uri": x.get("diffbotUri"),
            "confidence": x["confidence"],
            "sentiment": x["sentiment"],
            # Only the first listed type is kept.
            "type": x["allTypes"][0].get("name"),
        }
        for x in row.get("entities") or []
        # Entities without any type information are dropped.
        if x.get("allTypes")
    ]


def _parse_facts(row):
    """Turn the facts of one NLP response row into source/relationship/target dicts."""
    return [
        {
            "source": {
                "name": x["entity"]["name"],
                # Use the entity's own URI, consistent with _parse_entities.
                # NOTE(review): assumes fact entity/value objects expose
                # "diffbotUri" directly and that allTypes[0]["diffbotUri"]
                # identifies the type, not the entity - confirm against the
                # Diffbot response schema.
                "uri": x["entity"].get("diffbotUri"),
            },
            "relationship": x["property"]["name"],
            "target": {
                "name": x["value"]["name"],
                "uri": x["value"].get("diffbotUri"),
            },
            "confidence": x["confidence"],
        }
        for x in row.get("facts") or []
        # Keep only facts whose both ends are typed and are not the same name.
        if x["entity"].get("allTypes")
        and x["value"].get("allTypes")
        and x["entity"]["name"] != x["value"]["name"]
    ]


for offset in range(0, len(data), batch_size):
    # Batch data (positional slices, so any DataFrame index works)
    batch = data["text"].iloc[offset : offset + batch_size].to_list()
    batch_ids = data["id"].iloc[offset : offset + batch_size].to_list()
    payload = [
        {"content": el, "format": "plain text with title", "lang": "en"} for el in batch
    ]
    # Make request to NLP API
    nlp_response = nlp_request(payload)
    # nlp_request returns None on failure, and an API error may decode to a
    # dict instead of a list of rows; neither can be parsed per article.
    if not isinstance(nlp_response, list):
        print(f"Skipping batch at offset {offset}: no usable NLP response")
        continue
    # Results are matched to article ids by position, so a length mismatch
    # would attach results to the wrong articles.
    if len(nlp_response) != len(batch_ids):
        print(
            f"Skipping batch at offset {offset}: expected {len(batch_ids)} "
            f"results but received {len(nlp_response)}"
        )
        continue
    # Construct results by attaching the article id to the extracted NLP information
    for article_id, row in zip(batch_ids, nlp_response):
        nlp_results.append(
            {
                "id": article_id,
                "entity": _parse_entities(row),
                "fact": _parse_facts(row),
                "sentiment": row.get("sentiment"),
            }
        )

In [7]:
# Persist the collected NLP results as a single JSON document.
serialized_results = json.dumps(nlp_results)
with open("../data/nlp_output.json", "w") as output_file:
    output_file.write(serialized_results)