# 1 Introduction

This notebook scrapes [GDELT](https://www.gdeltproject.org) for all news events on each day of an input year and, for each day, saves to a CSV file all events that were geolocated to an input country.

## 1.0 Package imports

In [1]:
import calendar
import functools
import os
import sys
import time
import warnings

import gdelt
import pandas as pd
import requests
import urllib3
from tqdm import tnrange

%load_ext autoreload
%autoreload 2

if not sys.warnoptions:
    warnings.simplefilter("ignore")

## 1.1 Debugger template

In [2]:
class Debugger:
    """Decorator class that can log every call made to the callable it wraps.

    While the class attribute ``enabled`` is true, each call prints the
    wrapped callable's name and the positional/keyword arguments it received
    before the callable runs. The wrapped callable's return value is always
    passed through unchanged.
    """

    # Class-wide switch shared by every decorated callable.
    enabled = False

    def __init__(self, func):
        """Wrap ``func`` and copy its metadata onto this wrapper.

        Args:
            func: The callable to wrap.
        """
        # Copy __name__, __doc__, etc. so the decorated object still looks
        # like the original function. Done before storing ``func`` so nothing
        # carried over from the function's __dict__ can shadow self.func.
        functools.update_wrapper(self, func)
        self.func = func

    def __call__(self, *args, **kwargs):
        """Optionally log the call, then delegate to the wrapped callable.

        Returns:
            Whatever the wrapped callable returns.
        """
        if self.enabled:
            # Fall back to repr() for callables that have no __name__.
            print('Entering', getattr(self.func, '__name__', repr(self.func)))
            print('    args:', args, kwargs)
        return self.func(*args, **kwargs)

    def __get__(self, instance, owner=None):
        """Bind the wrapper to ``instance`` when it decorates a method."""
        if instance is None:
            return self
        return functools.partial(self, instance)


# Turn call logging on for every decorated callable.
Debugger.enabled = True

# 2 Data downloading

## 2.0 Constants

In [3]:
# Number of days in each calendar month of a non-leap year, January first.
lengths = [
    31, 28, 31, 30, 31, 30,  # January - June
    31, 31, 30, 31, 30, 31,  # July - December
]

# Client object from the gdelt package, constructed with version=2.
gd2 = gdelt.gdelt(version = 2)

## 2.1 Parameters

In [4]:
# Folder that receives the CSV output.
out_folder = "../data/indonesia/raw/"

# Year to download, kept as a string because it is formatted into file names
# and query strings.
year = "2019"

# Country code that events are filtered on.
# NOTE(review): GDELT documents ActionGeo_CountryCode as a FIPS 10-4 code, in
# which "IN" is India and Indonesia is "ID"; the output folder above is named
# "indonesia". Confirm which country is intended.
country = "IN"

## 2.3 Function definitions

In [5]:
@Debugger
def pull_month(month,
               country = country,
               year = year,
               out_folder = out_folder,
               max_attempts = 5):
    '''Scrapes GDELT for each day in the input month,
       saving one CSV of results per day to the output folder.

       Days whose CSV already exists in the output folder are skipped, so
       the function can be re-run to fill in only the missing days.

       Args:
           month: month number, 1 (January) through 12 (December).
           country: value that a row's ActionGeo_CountryCode must equal
               for the row to be kept.
           year: four-digit year, as a string or an int.
           out_folder: folder that receives the "<year><month><day>.csv"
               files; it is created if it does not exist.
           max_attempts: number of download attempts made per day before
               that day is given up on.

       Note: the defaults for country, year and out_folder are captured
       when this cell is executed; re-run the cell after changing the
       notebook parameters.
    '''
    month_str = str(month).zfill(2)
    # Ask the calendar rather than a fixed table so that February has
    # 29 days in leap years.
    days_in_month = calendar.monthrange(int(year), int(month))[1]
    # Fail early (or create the folder) instead of after the first download.
    os.makedirs(out_folder, exist_ok=True)

    for day in tnrange(1, days_in_month + 1):
        day_str = str(day).zfill(2)
        out_path = os.path.join(out_folder,
                                "{}{}{}.csv".format(year, month_str, day_str))
        if os.path.exists(out_path):
            continue

        for attempt in range(max_attempts):
            try:
                results = gd2.Search(['{} {} {}'.format(year, month_str, day_str)],
                                     table='events',coverage=True)
            except (requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectionError,
                    urllib3.exceptions.MaxRetryError) as e:
                print(e, attempt)
                # Linear back-off: the first retry is immediate.
                time.sleep(attempt*10)
                continue

            results = results[results['ActionGeo_CountryCode'] == country]
            results.to_csv(out_path)
            # Success: do not spend the remaining attempts on this day.
            break
        else:
            # Every attempt failed with a network error; report it rather
            # than silently leaving a gap in the output.
            print("Giving up on {} after {} attempts".format(out_path,
                                                             max_attempts))

## 2.4 Function execution

In [None]:
# Download every month, January through December. A plain loop is used
# because pull_month is called only for its side effects.
for month_number in range(1, 13):
    pull_month(month_number)

Entering pull_month
    args: (1,) {}


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


Entering pull_month
    args: (2,) {}


HBox(children=(IntProgress(value=0, max=28), HTML(value='')))


Entering pull_month
    args: (3,) {}


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


Entering pull_month
    args: (4,) {}


HBox(children=(IntProgress(value=0, max=30), HTML(value='')))


Entering pull_month
    args: (5,) {}


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


Entering pull_month
    args: (6,) {}


HBox(children=(IntProgress(value=0, max=30), HTML(value='')))


Entering pull_month
    args: (7,) {}


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


Entering pull_month
    args: (8,) {}


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


Entering pull_month
    args: (9,) {}


HBox(children=(IntProgress(value=0, max=30), HTML(value='')))


Entering pull_month
    args: (10,) {}


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))


Entering pull_month
    args: (11,) {}


HBox(children=(IntProgress(value=0, max=30), HTML(value='')))


Entering pull_month
    args: (12,) {}


HBox(children=(IntProgress(value=0, max=31), HTML(value='')))