# GDELT Pilot
### John Brandt

## Data Acquisition with BigQuery

In [1]:
import pandas as pd
import os

In [None]:
from google.cloud import bigquery
client = bigquery.Client()

query = (
    
    "SELECT SourceCommonName, Amounts, V2Locations, V2Organizations, V2Themes FROM [gdelt-bq:gdeltv2.gkg@-604800000-] "
    'WHERE Amounts LIKE "%trees%"'
    'AND Amounts LIKE "%planted%"'
)
query_job = client.query(
    query,
    location="US",
)

for row in query_job:  # API request - fetches results
    # Row values can be accessed by field name or index
    assert row[0] == row.name == row["name"]
    print(row)

## Load in data

Because each BigQuery call costs about \$0.25 USD, I will load in a CSV during each jupyter session. This notebook currently works with weekly references to tree plantings, but will be functionalized to work with other event detections in the future.

In [2]:
files = os.listdir("../data/external")
data = pd.read_csv("../data/external/" + files[2])

## Locating events

In order to tie events to locations and organizations, we assign the location most closely referenced to the event in the text. 

For each event detected, create a dictionary of the form {index : (number, action)}. This will be matched with a similar location dictionary of the form {index : location} with a grid search.

In [7]:
for i in range(125, 150):                                    # Loop through each entry
    print("\nData point {}".format(i))
    amount = data.iloc[i, 1]
    locs = [x for x in str(data.iloc[i, 2]).split(";")]
    refs = ([x for x in amount.split(";") if "tree" in x])   # Split up the references into value, action, index
    values, actions, indexes = [], [], []
    # Parse into separate lists
    # Generate key, (number, action) dictionary for each entry
    for ref in refs:                                       
        parsed = ref.split(",")
        values.append(int(parsed[0]))
        actions.append(parsed[1])
        indexes.append(int(parsed[2]))
    refs = dict(zip(indexes, zip(values, actions)))        # {index: (number, action)}
    locs_dict = {}
    # Generate key, value pair for each location in each entry
    for loc in locs:
        dict_i = {}
        locs_dict.update( { loc.split("#")[-1] : loc.split("#")[:-1] }) # {index : location}
    # TODO: Implement proposed matching system for ref, loc


Data point 125
dict_keys(['782', '60', '586', '2614', '3458'])
{2661: (6, 'plants between the streets')}

Data point 126
dict_keys(['1473'])
{1513: (6, 'plants between the streets')}

Data point 127
dict_keys(['332', '61', '146'])
{73: (1000000000, 'tree'), 88: (1000000000, 'tree will Be planted')}

Data point 128
dict_keys(['nan'])
{70: (2, 'trees fact')}

Data point 129
dict_keys(['2076', '121', '327', '639', '1507', '2105', '2091', '316', '628', '2066'])
{1206: (1000000000, 'trees will planted')}

Data point 130
dict_keys(['nan'])
{696: (2, 'places where the streets')}

Data point 131
dict_keys(['1222', '215', '506', '552', '51', '244', '447', '850', '879', '1663', '1825', '842', '1817'])
{1900: (2, 'trees planted into Her')}

Data point 132
dict_keys(['1405', '1433', '1555', '1090'])
{1295: (200, 'children planted trees')}

Data point 133
dict_keys(['nan'])
{1016: (5, 'lakh trees planted'), 1881: (10, 'cr LED street lights')}

Data point 134
dict_keys(['31', '191', '721', '807', '

In [None]:
# TODO: Implement summary statistics of matched ref, loc
# TODO: Geocode proposed loc
# TODO: Event isolation / deduplication
# TODO: Develop SVM / RandomForests classifier for (False positive / Planned / Implemented)
# TODO: Port to REST API
# TODO: Attache confidence to each point (define an algorithm)
# TODO: Export to leaflet dashboard

## Identifying patterns in themes

In [8]:
# FIXME: Move to new notebook
# ???: Country
# TODO: Isolate references to restoration interventions
# TODO: Mine relationships

## Identifying important actors and oroganizations in restoration

In [None]:
# FIXME: Movoe to new notebook
# TODO: Develop network of restoration actors by intervention
# ???: Country