# 1 Introduction

This notebook scrapes [GDELT](https://www.gdeltproject.org) for all news events on a given day (or on every day of a given month) and, for each configured country, saves the events geolocated to that country to a CSV file.

## 1.0 Package imports

In [1]:
import gdelt
import os
from tqdm import tnrange
import sys
import warnings
import pandas as pd
import requests
import urllib3
import time

%load_ext autoreload
%autoreload 2

if not sys.warnoptions:
    warnings.simplefilter("ignore")

## 1.1 Debugger template

In [2]:
class Debugger(object):
    """Decorator class that optionally traces calls to the wrapped function.

    Usage: place ``@Debugger`` above a function definition. When the
    class-level ``enabled`` flag is true, every call prints the function name
    and the positional/keyword arguments before delegating to the function.
    The wrapped function's return value is always passed through unchanged.
    """

    # Class-wide switch: flipping Debugger.enabled affects every decorated
    # function at once, because instances read it through self.enabled.
    enabled = False

    def __init__(self, func):
        # Local import so this cell does not rely on the notebook's import cell.
        import functools

        self.func = func
        # Copy __name__, __doc__, __module__, ... onto the wrapper so that
        # help(), tracebacks and introspection still describe the original
        # function instead of an anonymous Debugger instance.
        functools.update_wrapper(self, func)

    def __call__(self, *args, **kwargs):
        if self.enabled:
            print('Entering', self.func.__name__)
            print('    args:', args, kwargs)
        return self.func(*args, **kwargs)

# Turn call tracing on for every function decorated with @Debugger
# (the flag lives on the class, so it is shared by all decorated functions).
Debugger.enabled = True

# 2 Data downloading

## 2.0 Constants

In [3]:
# Number of days in each month, January through December, for a non-leap
# year (February is fixed at 28).
lengths = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
# Module-level GDELT client (version=2) shared by the cells below.
gd2 = gdelt.gdelt(version=2)

## 2.1 Parameters

In [4]:
# Country name -> two-letter code that pull_month compares against the
# 'ActionGeo_CountryCode' column of the GDELT results.
countries = {"brazil": "BR",
            "indonesia" :"ID",
            "mexico": "MX",}

# Year to scrape, kept as a string because it is formatted into queries and
# file names.
year = "2019"

# Country name -> output directory for that country's raw daily CSV files.
# Keys must match the keys of `countries`.
folders = {"brazil": "../data/brazil/raw/{}".format(year),
               "indonesia": "../data/indonesia/raw/{}".format(year),
               "mexico": "../data/mexico/raw/{}".format(year),
          }
               

In [5]:
# Sanity check: list each configured country name with its country code.
for key, value in countries.items():
    print(key, value)

brazil BR
indonesia ID
mexico MX


In [6]:
# NOTE(review): hard-coded to the Brazil 2019 folder (same value as
# folders["brazil"] when year == "2019"); it does not follow `year` or the
# other countries.
out_folder = '../data/brazil/raw/2019'

## 2.3 Function definitions

In [16]:
@Debugger
def pull_day(year, month, day, folders):
    """Download one day of GDELT events unless that day is already saved.

    Parameters
    ----------
    year, month, day : str or int
        Calendar date to query. ``month`` and ``day`` are zero-padded to two
        digits before being used in the query and in file names.
    folders : dict
        Maps a country name to its output directory. Missing directories are
        created. The day counts as "already saved" only when every one of
        these directories contains ``<year><month><day>.csv``.

    Returns
    -------
    The object returned by ``gdelt.gdelt(version=2).Search(...)`` for the
    'events' table (presumably a pandas DataFrame; confirm against the gdelt
    package), or ``None`` when the day is already saved in every folder.
    """
    for folder in folders.values():
        os.makedirs(folder, exist_ok=True)

    month = str(month).zfill(2)
    day = str(day).zfill(2)

    # Build the file name from the requested year (it used to be hard-coded
    # to 2019) and look in the folders that were actually passed in, rather
    # than in a notebook-level global.
    filename = "{}{}{}.csv".format(year, month, day)
    already_saved = bool(folders) and all(
        os.path.exists(os.path.join(folder, filename))
        for folder in folders.values()
    )
    if already_saved:
        return None

    gd2 = gdelt.gdelt(version=2)
    return gd2.Search(['{} {} {}'.format(year, month, day)],
                      table='events', coverage=True)

In [17]:
def pull_month(month,
               countries = countries,
               year = year,
               folders = folders):
    '''Scrape GDELT for every day of a month and save one CSV per country.

    For each day, the 'events' table is downloaded once with the module-level
    ``gd2`` client, filtered on ``ActionGeo_CountryCode`` for every entry in
    ``countries``, and written to ``<folders[country]>/<year><month><day>.csv``.
    Days whose CSV already exists for every country are skipped.

    Parameters
    ----------
    month : int or str
        Month number (1-12); strings such as "06" are accepted.
    countries : dict
        Maps country name -> country code to match in the results.
    year : str or int
        Year to scrape.
    folders : dict
        Maps country name -> output directory; needs a key for every
        country in ``countries``.

    Network errors are retried up to 5 times per day, sleeping
    ``attempt * 10`` seconds after each failure (so the first retry is
    immediate). A day that still fails is reported and skipped.
    '''
    # Local import so the function does not depend on the hand-written,
    # leap-year-unaware `lengths` table.
    import calendar

    for folder in folders.values():
        os.makedirs(folder, exist_ok=True)

    # Normalise once, outside the loop: int() lets callers pass "06" as well
    # as 6, and monthrange() handles February in leap years.
    month_number = int(month)
    month = str(month_number).zfill(2)
    days_in_month = calendar.monthrange(int(year), month_number)[1]

    # NOTE(review): tnrange is what the notebook imports; newer tqdm releases
    # prefer tqdm.notebook.trange - confirm against the installed version.
    for date in tnrange(1, days_in_month + 1):
        date = str(date).zfill(2)
        filename = "{}{}{}.csv".format(year, month, date)
        targets = {country: os.path.join(folders[country], filename)
                   for country in countries}

        # Skip the download only when every country's file is present; this
        # uses the requested year and folders instead of a hard-coded path.
        if all(os.path.exists(path) for path in targets.values()):
            continue

        for attempt in range(5):
            try:
                results = gd2.Search(['{} {} {}'.format(year, month, date)],
                                     table='events',coverage=True)
                print(results.shape)
                # Same for every country, so print it once per day.
                print(results['ActionGeo_CountryCode'].unique())

                for country, idx in countries.items():
                    temp = results[results['ActionGeo_CountryCode'] == idx]
                    print(temp.shape)
                    temp.to_csv(targets[country])
                # Success: stop retrying this day.
                break

            except (requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectionError,
                    urllib3.exceptions.MaxRetryError) as e:
                print(e, attempt)
                time.sleep(attempt*10)
        else:
            # All attempts failed; report it instead of moving on silently.
            print("Giving up on {} {} {} after 5 attempts".format(year, month, date))

## 2.4 Function execution

In [18]:
# Pull a single day (5 June 2019). `results` is None when pull_day decides
# the day is already saved on disk.
year = "2019"
month = "06"
day = "05"
results = pull_day(year, month, day, folders)

Entering pull_day
    args: ('2019', '06', '05', {'brazil': '../data/brazil/raw/2019', 'indonesia': '../data/indonesia/raw/2019', 'mexico': '../data/mexico/raw/2019'}) {}


Process ForkPoolWorker-212:
Process ForkPoolWorker-216:
Process ForkPoolWorker-211:
Process ForkPoolWorker-214:
Process ForkPoolWorker-213:


KeyboardInterrupt: 

Process ForkPoolWorker-215:
Process ForkPoolWorker-210:
Process ForkPoolWorker-209:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    

In [19]:
# Issue the same 'events' query directly, bypassing pull_day and its
# already-saved check. This rebinds the module-level `gd2` client.
gd2 = gdelt.gdelt(version=2)
results = gd2.Search(['{} {} {}'.format(year, month, day)],
                         table='events',coverage=True)

Process ForkPoolWorker-325:
Process ForkPoolWorker-324:
Process ForkPoolWorker-326:
Process ForkPoolWorker-321:
Process ForkPoolWorker-327:
Process ForkPoolWorker-322:
Process ForkPoolWorker-323:
Process ForkPoolWorker-328:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args,

KeyboardInterrupt: 

  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    