# GDELT Pilot
### John Brandt

## Data Acquisition with BigQuery

In [2]:
import pandas as pd
import os

In [None]:
from google.cloud import bigquery
client = bigquery.Client()

# The bracketed `[project:dataset.table@-604800000-]` reference is legacy-SQL
# syntax; the `@-604800000-` range decorator limits the scan to data added in
# the last 604,800,000 ms (7 days).
# Each fragment ends with a space so the implicitly concatenated literals do
# not run together (previously `"%trees%"AND ...`).
query = (
    "SELECT SourceCommonName, Amounts, V2Locations, V2Organizations, V2Themes "
    "FROM [gdelt-bq:gdeltv2.gkg@-604800000-] "
    'WHERE Amounts LIKE "%trees%" '
    'AND Amounts LIKE "%planted%"'
)

# client.query() defaults to standard SQL, which rejects the legacy table
# reference above, so legacy SQL has to be requested explicitly.
job_config = bigquery.QueryJobConfig(use_legacy_sql=True)
query_job = client.query(
    query,
    location="US",
    job_config=job_config,
)

for row in query_job:  # API request - fetches results
    # Row values can be accessed by field name or index,
    # e.g. row[1] or row["Amounts"].
    print(row)

## Load in data

Because each BigQuery call costs about \$0.25 USD, I load a previously exported CSV in each Jupyter session instead of re-running the query. This notebook currently works with one week of references to tree plantings, but will be functionalized to work with other event detections in the future.

In [3]:
# Directory holding the CSV exports of the BigQuery results.
external_dir = "../data/external"

# NOTE: os.listdir() returns entries in arbitrary order, so which file sits
# at index 2 depends on the filesystem.
files = os.listdir(external_dir)
data = pd.read_csv(external_dir + "/" + files[2])

## Locating events

In order to tie events to locations and organizations, we assign each event the location that is mentioned closest to it in the text.

For each event detected, create a dictionary of the form {index : (number, action)}. This will be matched with a similar location dictionary of the form {index : location} by picking the location whose index is nearest to the event's index.

In [57]:
def locate_event(i, df=None):
    """Match the tree-planting references in row ``i`` to the nearest location.

    Column 1 of the row is read as a ";"-separated list of
    ``value,action,index`` entries and column 2 as a ";"-separated list of
    "#"-delimited location records whose last field is an index. Only entries
    containing both "tree" and "plant" are kept as references.

    Args:
        i: Positional row index into the DataFrame.
        df: DataFrame to read from. Defaults to the notebook-level ``data``.

    Returns:
        ``(refs, location)`` where ``refs`` is ``{index: (number, action)}``
        and ``location`` is the list of fields (index field removed) of the
        location whose index is closest to the first reference's index.
        ``(None, None)`` when the row has no usable location or no matching
        reference.
    """
    if df is None:
        df = data  # fall back to the DataFrame loaded earlier in the notebook

    # str() turns a missing (NaN) cell into "nan", which yields no matches
    # below instead of raising on .split().
    amounts = str(df.iloc[i, 1])
    raw_locations = str(df.iloc[i, 2])

    refs = {}                                              # {index: (number, action)}
    for ref in amounts.split(";"):
        if "tree" not in ref or "plant" not in ref:
            continue
        try:
            # Split from both ends so commas inside the action text are kept.
            value, rest = ref.split(",", 1)
            action, index = rest.rsplit(",", 1)
            index = int(index)
            try:
                value = int(value)
            except ValueError:
                value = float(value)                       # tolerate non-integer amounts
        except ValueError:
            continue                                       # malformed entry: skip it
        refs[index] = (value, action)

    locs_dict = {}                                         # {index: location fields}
    for loc in raw_locations.split(";"):
        *fields, index = loc.split("#")
        try:
            locs_dict[int(index)] = fields
        except ValueError:
            continue                                       # "nan", empty or malformed segment

    if not locs_dict or not refs:                          # nothing to match
        return None, None

    # Only the first reference is used as the anchor for the location search.
    refs_idx = next(iter(refs))
    loc_idx = min(locs_dict, key=lambda x: abs(x - refs_idx))
    return refs, locs_dict[loc_idx]

In [71]:
# Spot-check a few rows: print the matched references and location for each.
# TODO: merge refs, location into dataframe
# TODO: join original themes, locations, events, and people to above DF
# TODO: confidence
# FIXME: Why do some of them not have references?
for i in range(105, 112):
    refs, location = locate_event(i)
    message = "Reference {}: \n {} \n {}\n".format(i, refs, location)
    print(message)

Reference 105: 
 None 
 None

Reference 106: 
 None 
 None

Reference 107: 
 {679: (10, 'cherries planting greenhouses subtree')} 
 ['4', 'Kunming, Yunnan, China', 'CH', 'CH29', '13303', '25.0389', '102.718', '-1913826']

Reference 108: 
 None 
 None

Reference 109: 
 {1230: (70, 'times nationwide tree planting')} 
 ['4', 'Tokoname, Aichi, Japan', 'JA', 'JA01', '33452', '34.8833', '136.85', '-246176']

Reference 110: 
 {1452: (70, 'times nationwide tree planting')} 
 ['4', 'Nagakute, Aichi, Japan', 'JA', 'JA01', '33461', '35.1753', '137.061', '-237670']

Reference 111: 
 {453: (200, 'children planted trees')} 
 ['1', 'Australia', 'AS', 'AS', '', '-25', '135', 'AS']



In [None]:
# TODO: Implement summary statistics of matched ref, loc
# TODO: Geocode proposed loc
# TODO: Event isolation / deduplication
# TODO: Develop SVM / RandomForests classifier for (False positive / Planned / Implemented)
# TODO: Port to REST API
# TODO: Attach confidence to each point (define an algorithm)
# TODO: Export to leaflet dashboard