## Load Packages and Set Arguments

In [1]:
import json
import re
import uuid

In [2]:
# Input file: the raw news records that the loading cell reads.
raw_data_path = 'raw-news_category-100.example.json'
# Output files: the processed records, and the variant prepared for Elasticsearch.
# NOTE(review): "exmpale" in both output names looks like a typo of "example";
# left unchanged because other files may refer to these names - confirm before renaming.
processed_data_path = 'processed-news_category-100.exmpale.json'
processed_data_es_path = 'processed-news_category-100.es.exmpale.json'

## Utility functions

In [3]:
def dump_json_list(out_path: str, json_data: list):
    """Write ``json_data`` to ``out_path`` as a JSON array with one element per line.

    Every element is serialized on its own with ``json.dumps``; the serialized
    elements are separated by a comma plus line break and enclosed in a leading
    "[" line and a trailing "]" line, so the file as a whole is one JSON array.

    Args:
        out_path: Path of the file to write. It is opened in 'w' mode, so an
            existing file is overwritten.
        json_data: List of objects that ``json.dumps`` can serialize.
    """
    # one compact JSON string per element
    serialized_items = [json.dumps(item) for item in json_data]

    # elements go on separate lines between the surrounding brackets
    body = ',\n'.join(serialized_items)
    file_text = "[\n{body}\n]\n".format(body=body)

    with open(out_path, 'w') as out_file:
        out_file.write(file_text)

## Raw Data
- The test data is derived from the [News Category
Dataset](https://www.kaggle.com/datasets/rmisra/news-category-dataset).
- The dataset contains around 200k news headlines published between 2012 and 2018, obtained from
[HuffPost](https://www.huffpost.com/).

In [4]:
# Load the raw data: every non-blank line of the input file is parsed as its
# own JSON document and collected into raw_data_list.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform; without it open() uses the locale's preferred encoding, which can
# fail on (or silently garble) non-ASCII characters in the text fields.
raw_data_list = []
with open(raw_data_path, 'r', encoding='utf-8') as infile:
    for line in infile:
        # skip blank lines (for example an empty line at the end of the file)
        if line.strip():
            raw_data_list.append(json.loads(line))

# print debug info: pretty-print the first record to show the available fields
print(json.dumps(raw_data_list[0], indent=4))

{
    "category": "CRIME",
    "headline": "There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV",
    "authors": "Melissa Jeltsen",
    "link": "https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89",
    "short_description": "She left her husband. He killed their children. Just another day in America.",
    "date": "2018-05-26"
}


## Processed Data
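- split the `authors` string into a list of author names
- extract `news_id` from `link`
- rename keys (`headline` → `title`, `short_description` → `description`)
- add a randomly generated `uuid` to each record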


In [5]:

processed_data_list = []

for data in raw_data_list:

    # authors: cut the raw string at every comma and every " and "
    # (matched case-insensitively), then keep the trimmed, non-blank pieces
    author_pieces = re.split(r"(?i)(?: and |[,])", data.get('authors', ""))
    author_list = [piece.strip() for piece in author_pieces if piece.strip()]

    # news_id: the text after the last underscore of the link
    # (a link without any underscore yields the whole link)
    link = data.get('link', "")
    news_id = link.rsplit('_', 1)[-1].strip()

    # assemble the record under its new key names; each record also gets
    # a freshly generated random uuid
    processed_record = {
        'uuid':        str(uuid.uuid4()),
        'news_id':     news_id,
        'link':        link,
        'title':       data.get('headline', ""),
        'description': data.get('short_description', ""),
        'date':        data.get('date', ""),
        'authors':     author_list,
        'category':    data.get('category', ""),
    }
    processed_data_list.append(processed_record)

# print debug info: pretty-print the first processed record
print(json.dumps(processed_data_list[0], indent=4))

{
    "uuid": "bfc90120-6883-4ef8-90e8-096b4d872878",
    "news_id": "5b081ab4e4b0802d69caad89",
    "link": "https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89",
    "title": "There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV",
    "description": "She left her husband. He killed their children. Just another day in America.",
    "date": "2018-05-26",
    "authors": [
        "Melissa Jeltsen"
    ],
    "category": "CRIME"
}


In [6]:
# write the processed records to disk as a JSON array
dump_json_list(
    out_path=processed_data_path,
    json_data=processed_data_list,
)

## Processed Data for Elasticsearch Indexing

In [7]:
# Build a flat list in which every processed record is preceded by an
# "index" action entry that carries the record's uuid as its "_id".
# NOTE(review): this action/document pairing presumably targets the
# Elasticsearch bulk API - confirm against the indexing step.
processed_data_es_list = []

for data in processed_data_list:
    index_action = {"index": {"_id": data['uuid']}}
    processed_data_es_list.extend((index_action, data))

In [8]:
# serialize every entry (action entries and records alike) to compact JSON
out_jstr_list = [json.dumps(entry) for entry in processed_data_es_list]

# one JSON document per line, with a line break after the last one
body = '\n'.join(out_jstr_list)
joined_out_str = "{body}\n".format(body=body)

# write the newline-delimited output, overwriting any existing file
with open(processed_data_es_path, 'w') as es_out_file:
    es_out_file.write(joined_out_str)