# Script to Download GDELT GKG 1.0


In [None]:
# !pip install -q git+https://github.com/codelucas/newspaper.git
# !pip install -q readability-lxml
# !pip install -q urllib  # NOTE: urllib is part of the Python standard library; no install is needed

[K     |████████████████████████████████| 81 kB 5.6 MB/s 
[K     |████████████████████████████████| 7.4 MB 34.5 MB/s 
[K     |████████████████████████████████| 9.6 MB 43.0 MB/s 
[K     |████████████████████████████████| 93 kB 2.7 MB/s 
[?25h  Building wheel for newspaper3k (setup.py) ... [?25l[?25hdone
  Building wheel for tinysegmenter (setup.py) ... [?25l[?25hdone
  Building wheel for feedfinder2 (setup.py) ... [?25l[?25hdone
  Building wheel for jieba3k (setup.py) ... [?25l[?25hdone
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


In [None]:
# import libraries

import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as  np
import time
from random import randint
import urllib
import re 
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
import math

import gc
import warnings
warnings.simplefilter('ignore', FutureWarning)

# Get GKG Data

In [None]:
# define search range: first and last day to download, as 'YYYY-MM-DD' strings

start_date = '2016-08-08'
end_date = '2017-11-28'

def get_dates(_start_date, _end_date):
    '''
    builds the list of calendar days between two dates

    Parameters:
    ———————————
    _start_date: first day of the range (in str format, e.g. '2016-08-08')
    _end_date: last day of the range (in str format); included in the result

    Outputs:
    ————————
    a list with one 'YYYY-MM-DD' string per day, in chronological order
    (empty when _end_date is earlier than _start_date)
    '''
    # pd.date_range yields one Timestamp per day; str() of its .date()
    # gives the ISO 'YYYY-MM-DD' form
    _days = pd.date_range(start=_start_date, end=_end_date)
    return [str(_day.date()) for _day in _days]

# apply date range function to get dates of Kenya 2017 election cycle
# (one 'YYYY-MM-DD' string per day, from start_date to end_date)
date_range = get_dates(start_date, end_date)

In [None]:
# get data from GKG 1.0 in batches
#
# For every batch of `batch_size_date` consecutive days this cell:
#   1. downloads each day's zipped GKG file and extracts it,
#   2. concatenates the daily tables into one frame,
#   3. keeps the rows whose LOCATIONS field mentions Kenya,
#   4. saves them to country_news_alt_batch_<batch>.csv.

# `import urllib` alone does not guarantee the `request` submodule is loaded
import urllib.request

# define parameters of batches
idx_date = 0
batch_size_date = 60
total_range = len(date_range)
num_batches_date = math.ceil(total_range / batch_size_date)
# one directory for downloads, extracted files and per-batch results, so the
# write and read paths agree regardless of the current working directory
data_dir = '/content'

for batch in range(num_batches_date):
    print('Batch:', batch)
    # collect the daily frames and concatenate once per batch
    # (DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call)
    daily_frames = []

    for date in date_range[idx_date:idx_date+batch_size_date]:
        print(date)
        # the GKG file names use the date without dashes (YYYYMMDD)
        _date = date.replace('-', '')
        zip_path = f'{data_dir}/{_date}.gkg.csv.zip'
        # fetch gdelt gkg data and write to file; the context managers close
        # both the HTTP response and the file even if the transfer fails
        with urllib.request.urlopen(f"http://data.gdeltproject.org/gkg/{_date}.gkg.csv.zip") as resp, \
                open(zip_path, "wb") as f:
            f.write(resp.read())
        # unzip compressed file and extract contents
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        # read news contents from date (tab-separated despite the .csv name)
        _alt_gkg = pd.read_csv(f'{data_dir}/{_date}.gkg.csv', lineterminator='\n', delimiter='\t')
        # keep day's news for the batch
        daily_frames.append(_alt_gkg)

    # combine the days; ignore_index gives a fresh 0..n-1 index
    other_gkg = pd.concat(daily_frames, ignore_index=True)
    # to datetime object
    other_gkg['DATE'] = pd.to_datetime(other_gkg['DATE'], format='%Y%m%d')
    # shape of unmodified gkg from batch
    print('other gkg:', other_gkg.shape)
    # get country specific data from batch; na=False treats rows with a
    # missing LOCATIONS value as non-matching
    kenya_mask = other_gkg['LOCATIONS'].str.contains('[kK]enya', regex=True, na=False)
    country_idx_alt = other_gkg.index[kenya_mask].tolist()
    print('country specific articles:', len(country_idx_alt))
    print('country specific indices',country_idx_alt)
    # get news for selected country -- ALT
    country_news_alt = other_gkg.loc[kenya_mask].copy()
    print('Number of country relevant articles:', country_news_alt.shape[0])
    # reset index
    country_news_alt.reset_index(inplace=True, drop=True)
    # save batch to file, named after the loop counter so that each batch
    # gets its own file instead of overwriting a single one
    country_news_alt.to_csv(f'{data_dir}/country_news_alt_batch_{batch}.csv')
    # update indices for next batch
    idx_date += batch_size_date

    # clear batch variables from memory before the next batch is loaded
    del daily_frames, _alt_gkg, other_gkg, kenya_mask, country_news_alt, resp
    gc.collect()

Batch: 0
2017-10-02
2017-10-03
2017-10-04
2017-10-05
2017-10-06
2017-10-07
2017-10-08
2017-10-09
2017-10-10
2017-10-11
2017-10-12
2017-10-13
2017-10-14
2017-10-15
2017-10-16
2017-10-17
2017-10-18
2017-10-19
2017-10-20
2017-10-21
2017-10-22
2017-10-23
2017-10-24
2017-10-25
2017-10-26
2017-10-27
2017-10-28
2017-10-29
2017-10-30
2017-10-31
2017-11-01
2017-11-02
2017-11-03
2017-11-04
2017-11-05
2017-11-06
2017-11-07
2017-11-08
2017-11-09
2017-11-10
2017-11-11
2017-11-12
2017-11-13
2017-11-14
2017-11-15
2017-11-16
2017-11-17
2017-11-18
2017-11-19
2017-11-20
2017-11-21
2017-11-22
2017-11-23
2017-11-24
2017-11-25
2017-11-26
2017-11-27
2017-11-28
other gkg: (7356225, 11)
country specific articles: 71541
country specific indices [60, 163, 193, 257, 394, 616, 731, 769, 786, 811, 845, 1443, 1535, 1551, 1974, 1990, 2012, 2026, 2068, 2118, 2154, 2297, 2435, 2441, 2443, 2938, 3074, 3097, 4000, 4119, 4227, 4275, 4361, 4702, 4745, 4867, 5301, 5345, 5554, 5617, 5682, 5730, 5788, 6096, 6213, 6250, 6621,

In [None]:
# append dataframes from each batch together

# read every per-batch file (batches 0 through 7) and concatenate them in a
# single call; DataFrame.append was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration
batch_frames = []
for batch in range(8):
    df = pd.read_csv(f'/content/country_news_alt_batch_{batch}.csv', lineterminator='\n', index_col=0)
    batch_frames.append(df)

# ignore_index=True replaces the per-batch row numbers with one 0..n-1 index
country_news = pd.concat(batch_frames, ignore_index=True)

# save combined df
country_news.to_csv('kenya_news_gkg_AUG2016-NOV2017_with_text_IDs.csv')