# 1 Introduction
## 1.0 Package imports

In [131]:
import numpy as np
import pandas as pd
import pickle
import os
from newsplease import NewsPlease

# 2 Definitions

## 2.0 Parameter definitions

In [132]:
# Which slice of the corpus this notebook processes: one country, one month.
country = 'indonesia'
year = 2019
month = 1

# Root folder for the selected country; the paths below are built from it.
path = "../data/{}/".format(country)

# Metadata CSV and article-pickle folder for the selected year/month.
# The month is zero-padded to two digits (1 -> "01").
metadata_path = "{}metadata/{}/{}.csv".format(path, year, str(month).zfill(2))
fulltext_path = "{}text/{}/{}/".format(path, year, str(month).zfill(2))

# Metadata columns carried into the output. The last two entries
# (SOURCEURL, title) are sliced off wherever per-action records are built.
cols_to_keep = [
    'Actor1Code',
    'Actor1Name',
    'Actor2Code',
    'IsRootEvent',
    'EventCode',
    'CAMEOCodeDescription',
    'EventRootCode',
    'QuadClass',
    'GoldsteinScale',
    'NumMentions',
    'AvgTone',
    'ActionGeo_FullName',
    'ActionGeo_Lat',
    'ActionGeo_Long',
    'SOURCEURL',
    'title',
]

## 2.1 Function definitions

In [133]:
def load_obj(idx: int, path: str = fulltext_path) -> "NewsPlease object":
    """Load the pickled article stored under index `idx`.

    Args:
        idx: Article index; it is zero-padded to five digits to form the
            file name (e.g. 7 -> "00007.pkl").
        path: Folder prefix. It is concatenated directly with the file name,
            so it must end with a path separator. The default is bound to
            `fulltext_path` when this function is defined.

    Returns:
        The object that was pickled in the file.
    """
    pickle_file = path + str(idx).zfill(5) + ".pkl"
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open(pickle_file, "rb") as handle:
        article = pickle.load(handle)
    return article

def match_metadata(gs_sample):
    """Return the metadata rows that correspond to one sample.

    Args:
        gs_sample: Mapping-like object with a 'month' entry (zero-padded to
            two digits here) and an 'ids' entry used as the lookup key.

    Returns:
        The rows of "../data/metadata/variables/<month>.csv" selected
        positionally by the matching-dictionary entry for the sample.
    """
    month_tag = str(gs_sample['month']).zfill(2)
    sample_id = gs_sample['ids']
    # NOTE(review): `load_dict` is not defined in the visible part of this
    # notebook -- confirm where it comes from before calling this function.
    lookup = load_dict(month_tag)
    matched_rows = lookup[sample_id]
    print('This sample matches these rows in {}.csv: {}'.format(month_tag, matched_rows))
    frame = pd.read_csv("../data/metadata/variables/{}.csv".format(month_tag))
    return frame.iloc[matched_rows]

def convert_json():
    """Placeholder: not implemented yet; does nothing and returns None."""
    return None

def save_json():
    """Placeholder: not implemented yet; does nothing and returns None."""
    return None

# 3 Execution

Merge each article's `dataframe` rows with its `NewsPlease` text to create one `JSON` object per article, structured as follows:

```javascript
{ "id": "00001",
  "country": "IN",
  "url": "https://",
  "date": "MM-DD-YYYY",
  "full_text": string,
  "article_title": string,
  "number_actions": int,
  "actions": {
      1: {'latitude', 'longitude', 'action_type', "goldstein", ...},
      2: {'latitude', 'longitude', 'action_type', "goldstein", ...},
      3: {'latitude', 'longitude', 'action_type', "goldstein", ...},
          },
  }
```

`mapping_dictionary` maps each URL index to the list of dataframe row indices that share that URL: `{ url_idx: [dataframe_idx, ...] }`. `str(url_idx).zfill(5) + ".pkl"` is the file name of the NewsPlease article in the `text` folder.

In [134]:
# Template for a single action: every kept column except the last two
# (SOURCEURL, title), each initialised to None.
action_dict = dict.fromkeys(cols_to_keep[:-2])

# Template for one article record; fields are filled in per article.
base_dict = dict(
    id=None,
    country=None,
    url=None,
    full_text=None,
    article_title=None,
    number_actions=None,
    actions={},
)

In [135]:
# Load the metadata table for the selected month.
data = pd.read_csv(metadata_path)

# Distinct URLs in order of first appearance; a URL's position in this array
# is the index used for its pickle file name and as the mapping key.
urls = data['to_scrape'].unique()

# Collect the row labels for every URL in a single pass over the frame,
# instead of re-scanning the whole column once per unique URL.
rows_by_url = data.groupby('to_scrape', sort=False).groups

# url index -> list of row labels sharing that URL. A missing (NaN) URL is not
# a group key, so it maps to an empty list, as an equality comparison would.
mapping_dictionary = {
    i: (rows_by_url[url].tolist() if url in rows_by_url else [])
    for i, url in enumerate(urls)
}

In [None]:
# Build one nested record per article: article-level fields from the pickled
# NewsPlease object plus one entry in 'actions' per matching metadata row.
for i, _ in enumerate(urls):
    # Only URLs that have a pickled article on disk are processed.
    if os.path.exists(fulltext_path + str(i).zfill(5) + ".pkl"):
        fulltext = load_obj(i)

        # drop=True discards the old row labels. Without it, reset_index()
        # prepends an 'index' column and every value pairs with the wrong
        # column name when rows are read positionally.
        metadata = data.iloc[mapping_dictionary[i]]
        metadata = metadata[cols_to_keep].reset_index(drop=True)

        item_dict = base_dict.copy()
        item_dict['id'] = str(i).zfill(5)
        item_dict['country'] = country
        item_dict['url'] = fulltext.url
        item_dict['article_title'] = fulltext.title
        item_dict['date'] = fulltext.date_publish
        item_dict['language'] = fulltext.language
        item_dict['number_actions'] = len(metadata)
        # Stored under 'full_text', the key that base_dict declares for the
        # article body.
        item_dict['full_text'] = fulltext.text

        # One record per metadata row, keyed by its position. Values are
        # looked up by column name so they cannot drift out of alignment.
        # The last two kept columns (SOURCEURL, title) are left out.
        actions_dict = {}
        for action in range(len(metadata)):
            row = metadata.iloc[action]
            actions_dict[action] = {col: row[col] for col in cols_to_keep[:-2]}

        item_dict['actions'] = actions_dict
        