# 1 Introduction

This notebook scrapes [GDELT](https://www.gdeltproject.org) for all news events on an input day, and saves to a CSV file all events that were geolocated to an input country.


On macOS, multiprocessing can cause a segfault in one of the workers, which hangs the process. This can be fixed either by running Python 3.8 or by setting `os.environ['no_proxy'] = "*"`.

Additionally, multiprocessing does not work well in Jupyter notebooks, so running this notebook directly is not recommended. Instead, run `download.py` from the terminal; otherwise the multiprocessing subcalls made by `gdelt` may hang.

## 1.0 Package imports

In [2]:
import calendar
import functools
import os
import sys
import time
import warnings

import gdelt
import pandas as pd
import requests
import urllib3
from tqdm import tnrange

# Disable proxy lookups for every host. Per the introduction of this notebook,
# this works around a macOS multiprocessing segfault that hangs the process.
os.environ['no_proxy'] = "*"
# IPython magics (not plain Python): load the autoreload extension and use
# mode 2, which re-imports edited modules before each cell is executed.
%load_ext autoreload
%autoreload 2

# Silence all warnings, but only when no warning options were supplied
# explicitly (sys.warnoptions is empty), so a user-provided -W still wins.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

## 1.1 Debugger template

In [3]:
class Debugger(object):
    '''Decorator that optionally prints a trace line for every call.

    Wrap a function with ``@Debugger``. While ``Debugger.enabled`` is true,
    each call prints the function name and its arguments before delegating
    to the wrapped function; the wrapped function's return value is always
    passed through unchanged.
    '''
    # Class-wide switch shared by every decorated function.
    enabled = False

    def __init__(self, func):
        # Copy __name__, __doc__, __module__, ... onto the wrapper so that
        # decorated functions stay introspectable (help(), tracebacks).
        # Done before assigning self.func so that an attribute named "func"
        # in the wrapped callable's __dict__ cannot clobber it.
        functools.update_wrapper(self, func)
        self.func = func

    def __call__(self, *args, **kwargs):
        if self.enabled:
            # Not every callable has __name__ (e.g. functools.partial
            # objects), so fall back to its repr instead of raising.
            print('Entering', getattr(self.func, '__name__', repr(self.func)))
            print('    args:', args, kwargs)
        return self.func(*args, **kwargs)


# Turn call tracing on for every Debugger-wrapped function.
Debugger.enabled = True

# 2 Data downloading

## 2.0 Constants

In [4]:
# Days in each calendar month, January through December. February is fixed
# at 28, so this table describes a non-leap year.
lengths = [
    31, 28, 31, 30, 31, 30,  # Jan - Jun
    31, 31, 30, 31, 30, 31,  # Jul - Dec
]

# Module-level gdelt client (constructed with version=2) used for searches.
gd2 = gdelt.gdelt(version=2)

## 2.1 Parameters

In [5]:
# Countries to extract, keyed by lowercase name. Each value is the code that
# is compared against GDELT's ActionGeo_CountryCode column when filtering.
countries = {
    "brazil": "BR",
    "indonesia": "ID",
    "mexico": "MX",
}

# Year to download, kept as a string because it is spliced into search
# queries and output file names.
year = "2019"

# Output directory for each country's raw CSVs. Derived from `countries` so
# the two mappings always share the same keys and the path template is
# written only once.
folders = {
    name: "../data/{}/raw/{}".format(name, year)
    for name in countries
}

## 2.3 Function definitions

In [8]:
@Debugger
def pull_day(year, month, day, folders):
    '''Download the GDELT events table for a single day.

    Parameters
    ----------
    year, month, day : int or str
        Date to fetch. ``month`` and ``day`` are zero-padded to two digits.
    folders : dict
        Maps a country name to the directory holding its daily CSV files.
        Every directory is created if it does not exist yet.

    Returns
    -------
    The object returned by ``gdelt.Search`` for that day, or ``None`` when
    every folder already contains ``<year><month><day>.csv`` (nothing left
    to download).
    '''
    for folder in folders.values():
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(folder, exist_ok=True)

    month = str(month).zfill(2)
    day = str(day).zfill(2)
    filename = "{}{}{}.csv".format(year, month, day)

    # Skip the download only when the day is already saved for every country.
    # (The original referenced an undefined `out_folder` and hard-coded 2019.)
    already_saved = bool(folders) and all(
        os.path.exists("{}/{}".format(folder, filename))
        for folder in folders.values()
    )
    if already_saved:
        return None

    # A fresh client is built per call, as in the original implementation.
    gd2 = gdelt.gdelt(version=2)
    return gd2.Search(['{} {} {}'.format(year, month, day)],
                      table='events', coverage=True)

In [9]:
def pull_month(month,
               countries = countries,
               year = year,
               folders = folders):
    '''Scrape GDELT for each day in the input month and save, for every
       country, the events whose ActionGeo_CountryCode matches its code.

       Parameters
       ----------
       month : int
           Calendar month, 1-12.
       countries : dict
           Maps a country name to the code matched against the
           ``ActionGeo_CountryCode`` column of the search results.
       year : int or str
           Year to download.
       folders : dict
           Maps each country name in ``countries`` to its output directory.
           Files are written as ``<folder>/<year><month><day>.csv``.

       Days whose CSV already exists for every country are skipped. Each
       download is attempted up to 5 times on network errors, sleeping
       ``attempt * 10`` seconds between attempts; a day that still fails is
       reported and skipped.
    '''
    for folder in folders.values():
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(folder, exist_ok=True)

    # Ask the calendar for the month length so Feb 29 is not dropped in
    # leap years (a fixed 28-day table would miss it).
    n_days = calendar.monthrange(int(year), int(month))[1]
    month = str(month).zfill(2)

    for date in tnrange(1, n_days + 1):
        date = str(date).zfill(2)

        # Only (re)write the files that are actually missing, instead of
        # keying the check on a single hard-coded country.
        missing = {}
        for country in countries:
            path = "{}/{}{}{}.csv".format(folders[country], year, month, date)
            if not os.path.exists(path):
                missing[country] = path
        if not missing:
            continue

        for attempt in range(5):
            try:
                results = gd2.Search(['{} {} {}'.format(year, month, date)],
                                     table='events',coverage=True)
            except (requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectionError,
                    urllib3.exceptions.MaxRetryError) as e:
                print(e, attempt)
                time.sleep(attempt*10)
                continue

            # Same for every country, so print it once per day.
            print(results['ActionGeo_CountryCode'].unique())
            for country, path in missing.items():
                temp = results[results['ActionGeo_CountryCode'] == countries[country]]
                print(temp.shape)
                temp.to_csv(path)
            # Success: stop retrying this day.
            break
        else:
            # All attempts raised a network error; say so instead of
            # silently moving on to the next day.
            print("Giving up on {} {} {} after 5 attempts".format(year, month, date))

## 2.4 Function execution

In [None]:
# Download every month of the configured year, January through December.
# A plain loop is used because pull_month is called only for its side
# effects; a list comprehension would just build a list of None values.
for month_number in range(1, 13):
    pull_month(month_number)