# 1 Introduction
## 1.0 Package imports

In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import json
from newsplease import NewsPlease
from tqdm import tnrange

# 2 Definitions

## 2.0 Parameter definitions

In [2]:
country = 'indonesia'
year = 2017
month = 1
path = "../data/{}/".format(country)

def set_params(country, year, month):
    metadata_path = (path + "metadata/" + str(year) + "/"
                     + str(month).zfill(2) + ".csv")

    fulltext_path = (path + "text/" + str(year) + "/"
                     + str(month).zfill(2) + "/")

    json_path = (path + "json/" + str(year) + "/"
                     + str(month).zfill(2) + "/")
    return metadata_path, fulltext_path, json_path

metadata_path, fulltext_path, json_path = set_params(country, year, month)

if not os.path.exists(json_path):
    os.makedirs(json_path)

cols_to_keep = ['Actor1Code', 'Actor1Name', 'Actor2Code', 'IsRootEvent', 'EventCode', 'CAMEOCodeDescription',
                'EventRootCode', 'QuadClass', 'GoldsteinScale', 'NumMentions', 'AvgTone', 'ActionGeo_FullName',
                'ActionGeo_Lat', 'ActionGeo_Long', 'SOURCEURL', 'title']

## 2.1 Function definitions

In [3]:
def load_obj(idx: int, path: str = fulltext_path) -> "NewsPlease object":
    idx = str(idx).zfill(5)
    with open(path + idx + ".pkl", "rb") as f:
        return pickle.load(f)
    
def save_obj(obj: dict, idx: int, path: str = json_path) -> None:
    idx = str(idx).zfill(5)
    with open(path + idx + ".pkl", 'wb') as fp:
         pickle.dump(obj, fp)
    

# 3 Execution

Merge the `dataframe` row with the `NewsPlease` text to create a `JSON` object as such:

```javascript
{ "id": "00001",
  "country": "IN",
  "url": "https://",
  "date": "MM-DD-YYYY",
  "full_text": string,
  "article_title": string,
  "number_actions": int,
  "actions": {
      1: {'latitude', 'longitude', 'action_type', "goldstein", ...},
      2: {'latitude', 'longitude', 'action_type', "goldstein", ...},
      3: {'latitude', 'longitude', 'action_type', "goldstein", ...},
          },
  }
```

`mapping_dictionary` maps: `{ "url_idx": "dataframe_idx"}`, `url_idx.zfill(5)` is the file name of the NewsPlease article in the `texts` folder.

In [44]:
action_dict = dict(zip(cols_to_keep[:-2], [None] * (len(cols_to_keep) - 2)))

base_dict = {
    'id': None,
    'country': None,
    'url': None,
    'full_text': None,
    'article_title': None,
    'number_actions': None,
    'actions': dict()
}

def process_month(metadata_path, fulltext_path, json_path):
    data = pd.read_csv(metadata_path)

    urls = data['to_scrape'].unique()
    mapping_dictionary = {}
    for i, val in enumerate(urls):
        match = data.index[data['to_scrape'] == urls[i]].tolist()
        mapping_dictionary[i] = match 

    for i, val in enumerate(urls):
        if os.path.exists(fulltext_path + str(i).zfill(5) + ".pkl"):
            #print(fulltext_path + str(i).zfill(5) + ".pkl")
            fulltext = load_obj(i, path = fulltext_path)
            metadata = data.iloc[mapping_dictionary[i]]
            metadata = metadata[cols_to_keep].reset_index()

            item_dict = base_dict.copy()
            item_dict['id'] = str(i).zfill(5)
            item_dict['country'] = country
            item_dict['url'] = fulltext.url
            item_dict['article_title'] = fulltext.title
            item_dict['date'] = fulltext.date_publish
            item_dict['language'] = fulltext.language
            item_dict['number_actions'] = len(metadata)
            item_dict['text'] = fulltext.text

            actions_dict = {}
            for action in range(len(metadata)):
                metadata_i = list(metadata.iloc[action][1:])
                action_dict_i = dict(zip(cols_to_keep, metadata_i))
                actions_dict[action] = action_dict_i

            item_dict['actions'] = actions_dict
            save_obj(item_dict, i, json_path)

In [45]:
for year in [2017, 2018, 2019]:
    for month in tnrange(1, 12):
        metadata_path, fulltext_path, json_path = set_params(country, year, month)
        process_month(metadata_path, fulltext_path, json_path)


  


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [95]:
import pickle as pkl
#x = pkl.load("../data/indonesia/json/2017/01/00002.pkl")
with open("../data/brazil/json/2017/04/00475.pkl", "rb") as f:
    print(pkl.load(f))

{'id': '00475', 'country': 'brazil', 'url': 'http://www.dailymail.co.uk/news/article-4445800/Amazonian-protesters-hurl-spears-Brazilian-riot-police.html', 'full_text': None, 'article_title': 'Amazonian protesters hurl spears at Brazilian riot police', 'number_actions': 1, 'actions': {0: {'Actor1Code': 'BRA', 'Actor1Name': 'BRASILIA', 'Actor2Code': nan, 'IsRootEvent': 1, 'EventCode': 141, 'CAMEOCodeDescription': ' Demonstrate or rally', 'EventRootCode': 14, 'QuadClass': 3, 'GoldsteinScale': -6.5, 'NumMentions': 10, 'AvgTone': -8.91089108910891, 'ActionGeo_FullName': 'Amapa, MaranhãBR, Brazil', 'ActionGeo_Lat': -3.14056, 'ActionGeo_Long': -43.2933, 'SOURCEURL': 'http://www.dailymail.co.uk/news/article-4445800/Amazonian-protesters-hurl-spears-Brazilian-riot-police.html', 'title': 'Amazonian protesters hurl spears Brazilian riot police'}}, 'date': datetime.datetime(2017, 4, 26, 0, 58, 25), 'language': 'en', 'text': "Advertisement\nThousands of members of an indigenous Brazilian tribe are c

In [6]:
x

<NewsArticle.NewsArticle at 0x122b0cf28>