In [2]:
import os
import pandas as pd
import requests
import zipfile
from io import BytesIO
import gdelt
from multiprocessing import cpu_count
from joblib import Parallel, delayed
from datetime import datetime, timedelta
from tqdm import tqdm
import gc

In [3]:
# All downloaded CSVs are cached under this directory. It is created relative
# to the current working directory and left untouched if it already exists.
output_folder = 'GDELT'
os.makedirs(name=output_folder, exist_ok=True)

In [4]:
# One client per GDELT version; only gd1 is queried in this cell.
gd1 = gdelt.gdelt(version=1)
gd2 = gdelt.gdelt(version=2)

# Single-day query: keep the token columns, cut the event code down to its
# first two characters, drop incomplete rows, then total NumMentions for each
# (source country, target country, event code, day) combination.
results = (
    gd1.Search(['2020 01 01'], table='events', coverage=False)
    [['Actor1CountryCode', 'Actor2CountryCode', 'EventBaseCode', 'SQLDATE', 'NumMentions']]
    .assign(EventBaseCode=lambda frame: frame['EventBaseCode'].str[:2])
    .dropna()
    .groupby(['Actor1CountryCode', 'Actor2CountryCode', 'EventBaseCode', 'SQLDATE'])
    .sum()
    .reset_index()
)

In [5]:
# Columns kept from the raw GDELT events table: source country, target country,
# event code, day, and the mention count attached to that token.
_EVENT_COLUMNS = ['Actor1CountryCode', 'Actor2CountryCode', 'EventBaseCode', 'SQLDATE', 'NumMentions']


def _tidy_events(events, numeric_codes_only=False):
    """
    Reduces a raw GDELT events table to rows with a complete token.
    Args:
        events: pandas dataframe returned by an events Search; must contain the columns in _EVENT_COLUMNS
        numeric_codes_only: bool. If True, also drops rows whose truncated EventBaseCode is not all digits
    Returns:
        new pandas dataframe with only the columns in _EVENT_COLUMNS; the input frame is not modified
        and the surviving rows keep their original index labels
    """
    tidy = (
        events[_EVENT_COLUMNS]
        # Only the first two characters of the event code are kept. Building the
        # column with assign avoids writing through .loc into a column subset.
        .assign(EventBaseCode=lambda frame: frame['EventBaseCode'].str[:2])
        # A token is complete only when none of the kept columns is missing.
        .dropna()
    )
    if numeric_codes_only:
        tidy = tidy[tidy['EventBaseCode'].str.isnumeric()]
    return tidy


def process_date(date):
    """
    Queries the GDELT database and returns all events with a complete token (i.e. source, target country, action and time) by day.
    GDELT 2 only supports 2015 02 18 and onwards.
    Rows whose two-character event code is not numeric are dropped.
    Any exception raised while querying or cleaning is printed and swallowed; an empty dataframe
    with the same columns is returned instead, so callers should check `.empty` before relying on the result.
    Args:
        date: string. YYYY MM DD. Only the first four characters are parsed here (as the year);
              other cells in this notebook also pass a bare 'YYYY' string.
    Returns:
        pandas dataframe with source, target country codes, action, day and number of mentions
    """
    try:
        # Years up to and including 2016 go through the version 1 client, later years through version 2.
        ver = 1 if int(date[:4]) <= 2016 else 2
        raw_events = gdelt.gdelt(version=ver).Search([date], table='events', coverage=True)
        return _tidy_events(raw_events, numeric_codes_only=True)
    except Exception as e:
        print(f'Failed to process {date}: {e}')
        return pd.DataFrame(columns=_EVENT_COLUMNS)


def process_dates(dates, ver):
    """
    Queries a list of dates instead.
    Unlike process_date, exceptions are not caught here and rows with a non-numeric event code are kept.
    Args:
        dates: list. List of dates of format YYYY MM DD
        ver: int. GDELT version passed to the gdelt client
    Returns
        pandas dataframe with source, target country codes, action, day and number of mentions
    """
    raw_events = gdelt.gdelt(version=ver).Search(dates, table='events', coverage=True)
    return _tidy_events(raw_events)

In [37]:
gc.collect()

# For dates up to 2015, you must use GDELT version 1
start_date = datetime(2000, 1, 1)
end_date = datetime(2001, 12, 31)
n_days = (end_date - start_date).days + 1  # inclusive of end_date
dates = [(start_date + timedelta(days=offset)).strftime('%Y %m %d') for offset in range(n_days)]
batch_size = 20
for i in tqdm(range(0, len(dates), batch_size)):
    batch_dates = dates[i:i + batch_size]
    # The file name carries the first and last date of the batch, with underscores instead of spaces.
    start, end = (day.replace(' ', '_') for day in (batch_dates[0], batch_dates[-1]))
    batch_filename = f"gdelt_{start}_to_{end}.csv"
    filepath = os.path.join(output_folder, batch_filename)

    # Batches whose CSV is already on disk are not downloaded again.
    if os.path.exists(filepath):
        continue

    results = process_dates(batch_dates, ver=1)
    results.to_csv(filepath, index=False)
    # Release the batch before fetching the next one.
    del results
    gc.collect()

100%|██████████| 37/37 [35:27<00:00, 57.50s/it]


In [10]:
gc.collect()

# One CSV per year. process_date picks the GDELT version itself from the year,
# so no version is chosen in this cell.
dates = [str(year) for year in range(2000, 2021)]
for year in tqdm(dates, desc='Downloading GDELT data'):
    batch_filename = f"gdelt_{year}.csv"
    filepath = os.path.join(output_folder, batch_filename)

    # Years whose CSV is already on disk are not downloaded again.
    if os.path.exists(filepath):
        continue

    results = process_date(year)
    if results.empty:
        # process_date returns an empty frame when the query failed. Writing it
        # would create a file that the exists-check above treats as a finished
        # download, so the year would never be retried.
        tqdm.write(f'No rows returned for {year}; not writing {filepath} so it is retried on the next run')
    else:
        results.to_csv(filepath, index=False)
    # Release the year's frame before fetching the next one.
    del results
    gc.collect()

Downloading GDELT data: 100%|██████████| 21/21 [12:08<00:00, 34.70s/it]


In [30]:
# Load two batch files that are named for different date ranges.
test_1, test_2 = (
    pd.read_csv(os.path.join('GDELT', csv_name))
    for csv_name in ('gdelt_2000_01_21_to_2000_02_09.csv', 'gdelt_2000_06_09_to_2000_06_15.csv')
)
# Check 1: both files hold exactly the same table.
assert test_1.equals(test_2)
# Check 2: after sorting by day, appending the second file and dropping
# duplicate rows leaves the first table unchanged.
test_1 = test_1.sort_values(by='SQLDATE', ascending=True)
test_3 = pd.concat([test_1, test_2]).drop_duplicates()
assert test_1.equals(test_3)

In [31]:
# Display the last five rows of the day-sorted batch.
test_1.tail(5)

Unnamed: 0,Actor1CountryCode,Actor2CountryCode,EventBaseCode,SQLDATE,NumMentions
411271,PSE,EGY,16,20001231,9
9780,AFR,RWA,8,20001231,16
492740,TCD,TCD,18,20001231,9
411104,PSE,EGY,4,20001231,57
411227,PSE,EGY,11,20001231,9


In [23]:
# Request every day of 2000 in a single process_dates call and print both ends of the result.
start_date = datetime(2000, 1, 1)
end_date = datetime(2000, 12, 31)
dates = [
    (start_date + timedelta(days=i)).strftime('%Y %m %d')
    for i in range((end_date - start_date).days + 1)
]
test_result = process_dates(dates, ver=1)
print(test_result.head(), test_result.tail(), sep='\n')

  Actor1CountryCode Actor2CountryCode EventBaseCode   SQLDATE  NumMentions
0               ABW               ABW            16  20001027            6
1               ABW               CRI            04  20001215            2
2               ABW               CRI            04  20001216            8
3               ABW               CRI            04  20001222            6
4               ABW               LBR            04  20001216            9
       Actor1CountryCode Actor2CountryCode EventBaseCode   SQLDATE  \
599453               ZWE               ZWE            19  20001111   
599454               ZWE               ZWE            19  20001208   
599455               ZWE               ZWE            19  20001210   
599456               ZWE               ZWE            19  20001221   
599457               ZWE               ZWE            19  20001229   

        NumMentions  
599453            1  
599454            3  
599455            2  
599456            6  
599457            3

In [7]:
# Year-only query through process_date, followed by the row count, both ends
# of the frame and the distinct SQLDATE values, each printed on its own line(s).
test_result = process_date('2001')
print(
    len(test_result),
    test_result.head(),
    test_result.tail(),
    pd.unique(test_result['SQLDATE']),
    sep='\n',
)

1821597
    Actor1CountryCode Actor2CountryCode EventBaseCode   SQLDATE  NumMentions
503               AFG               AFG            17  20010101            4
506               AFG               BGD            01  20010101           10
507               AFG               BGD            01  20010101            1
508               AFG               BGD            01  20010101            1
512               AFG               CHN            04  20010101            9
        Actor1CountryCode Actor2CountryCode EventBaseCode   SQLDATE  \
4995873               ZWE               USA            04  20011231   
4995874               ZWE               ZWE            04  20011231   
4995875               ZWE               ZWE            01  20011231   
4995876               ZWE               ZWE            04  20011231   
4995877               ZWE               ZWE            01  20011231   

         NumMentions  
4995873            6  
4995874            7  
4995875            1  
4995876    