In [1]:
import gdelt
import pandas as pd
import datetime
from dateutil import parser

In [2]:
gd =  gdelt.gdelt(version=2)

In [None]:
events = gd.Search(['2020 November 18'],table='mentions',output='pd',normcols=True,coverage=False)
# '2020 November 17' Errors Out

In [None]:
events

In [17]:
events.columns

Index(['globaleventid', 'sqldate', 'monthyear', 'year', 'fractiondate',
       'actor1code', 'actor1name', 'actor1countrycode', 'actor1knowngroupcode',
       'actor1ethniccode', 'actor1religion1code', 'actor1religion2code',
       'actor1type1code', 'actor1type2code', 'actor1type3code', 'actor2code',
       'actor2name', 'actor2countrycode', 'actor2knowngroupcode',
       'actor2ethniccode', 'actor2religion1code', 'actor2religion2code',
       'actor2type1code', 'actor2type2code', 'actor2type3code', 'isrootevent',
       'eventcode', 'cameocodedescription', 'eventbasecode', 'eventrootcode',
       'quadclass', 'goldsteinscale', 'nummentions', 'numsources',
       'numarticles', 'avgtone', 'actor1geotype', 'actor1geofullname',
       'actor1geocountrycode', 'actor1geoadm1code', 'actor1geoadm2code',
       'actor1geolat', 'actor1geolong', 'actor1geofeatureid', 'actor2geotype',
       'actor2geofullname', 'actor2geocountrycode', 'actor2geoadm1code',
       'actor2geoadm2code', 'actor2geo

In [115]:
#China (CHN), Japan (JPN), India (IND), Vietnam (VNM), Philippines (PHL), Taiwan (TWN), US (USA)
#events.loc[events['actor2code'] == "TWN"]#.loc[events['actor1code'] == "JUD"]
test = ["CHN", "JPN", "IND", "VNM", "PHL", "TWN", "USA"]
#events.query('actor1code in {} & actor2code in {}'.format(test,test))

In [9]:
#qc1 = events.loc[events['actor2code'].isnull()].query('actor1code in {}'.format(test))
#qc2 = events.loc[events['actor1code'].isnull()].query('actor2code in {}'.format(test))
#events.query('actor1code in {0} & actor2code in {0}'.format(test))

In [204]:
class DatasetProcessor(object):
    
    """Wrapper for Gdelt with exception handling for dates"""
    
    def __init__(self, dateRange: list, actorCodes: list, outputDir: str):
        self.dateRange = self._format_date_range(dateRange) #list
        self.actorCodes = actorCodes #list
        self.outputDir = outputDir
        self.listOfQueryDates = self.dateRange
        if(len(self.dateRange) > 1):
            self._get_list_of_query_dates()
        
    def _format_date_range(self, dateList):
        return list(map(parser.parse, dateList))
    
    def _get_query_date_delta(self):
        startDate = self.dateRange[0]
        endDate = self.dateRange[1]
        delta = endDate-startDate
        return delta.days
    
    def _get_list_of_query_dates(self):
        nextDate = self.dateRange[0]
        queryDates = [nextDate]
        for i in range(self._get_query_date_delta()): 
            nextDate += datetime.timedelta(days=1)
            queryDates.append(nextDate)
        self.listOfQueryDates = queryDates
    
    def query_date(self, date):
        events = gd.Search(['{} {} {}'.format(date.year, date.month, date.day)],table='events',output='pd',normcols=True,coverage=False)
        qc1 = events.loc[events['actor2code'].isnull()].query('actor1code in {0} | actor1countrycode in {0}'.format(self.actorCodes))
        qc2 = events.loc[events['actor1code'].isnull()].query('actor2code in {0} | actor2countrycode in {0}'.format(self.actorCodes))
        qc3 = events.query('(actor1code in {0} | actor1countrycode in {0}) & (actor2code in {0} | actor2countrycode in {0})'.format(self.actorCodes))
        result = pd.concat([qc1, qc2, qc3])
        return result
    
    
    def query_dates(self):
        dataFrames = []
        exceptions = []
        for date in self.listOfQueryDates:
            try:
                data = self.query_date(date)
                dataFrames.append(data)
            except:
                exceptions.append(date)
        sortedDataFrame = pd.concat(dataFrames).sort_values('globaleventid')
        columns = ['year'] #Filters out null year
        filter_ = (sortedDataFrame[columns] >= self.listOfQueryDates[0].year).all(axis=1)
        return sortedDataFrame[filter_], exceptions
    

dateRange = ['2021 January 1', '2021 January 31']
actorCodes = ["CHN", "JPN", "IND", "VNM", "PHL", "TWN", "USA"]
#condition = 'actor1code in {} & actor2code in {}'.format(actorCodes, actorCodes)
dp = DatasetProcessor(dateRange, actorCodes, "output")

In [205]:
data, exceptions = dp.query_dates()

In [206]:
data #68K

Unnamed: 0,globaleventid,sqldate,monthyear,year,fractiondate,actor1code,actor1name,actor1countrycode,actor1knowngroupcode,actor1ethniccode,...,actiongeotype,actiongeofullname,actiongeocountrycode,actiongeoadm1code,actiongeoadm2code,actiongeolat,actiongeolong,actiongeofeatureid,dateadded,sourceurl
13,962321134,20210101,202101,2021,2021.0027,,,,,,...,4,"Mongkok, Hong Kong (general), Hong Kong",HK,HK00,13031,22.316700,114.167000,-1353526,20210101234500,https://www.chinafile.com/viewpoint/no-hong-ko...
14,962321135,20210101,202101,2021,2021.0027,,,,,,...,4,"Mongkok, Hong Kong (general), Hong Kong",HK,HK00,13031,22.316700,114.167000,-1353526,20210101234500,https://www.chinafile.com/viewpoint/no-hong-ko...
15,962321136,20210101,202101,2021,2021.0027,,,,,,...,4,"Guangzhou, Guangdong, China",CH,CH30,13033,23.116700,113.250000,-1907161,20210101234500,https://www.chinafile.com/viewpoint/no-hong-ko...
40,962321161,20210101,202101,2021,2021.0027,,,,,,...,4,"Aurangabad, Maharashtra, India",IN,IN16,70178,19.883300,75.333300,-2089356,20210101234500,https://timesofindia.indiatimes.com/city/auran...
47,962321168,20210101,202101,2021,2021.0027,,,,,,...,3,"Bexar County, Texas, United States",US,USTX,,29.450200,-98.517000,1383800,20210101234500,https://www.ksat.com/news/local/2021/01/01/fiv...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,967254349,20210131,202101,2021,2021.0849,USAGOV,JOE BIDEN,USA,,,...,4,"Bangkok, Krung Thep Mahanakhon, Thailand",TH,TH40,26423,13.750000,100.517000,-3250083,20210131234500,https://www.msn.com/en-xl/news/other/arkhom-vo...
684,967254350,20210131,202101,2021,2021.0849,USALEG,IOWA,USA,,,...,3,"Des Moines, Iowa, United States",US,USIA,,41.600500,-93.609100,465961,20210131234500,https://www.amestrib.com/story/news/politics/2...
685,967254351,20210131,202101,2021,2021.0849,USALEG,JERSEY,USA,,,...,1,Jersey,JE,JE,,49.216667,-2.116667,JE,20210131234500,https://abc7ny.com/politics/up-close-johnson-a...
686,967254352,20210131,202101,2021,2021.0849,USALEG,IOWA,USA,,,...,3,"North Liberty, Iowa, United States",US,USIA,,41.749200,-91.597900,465945,20210131234500,https://www.amestrib.com/story/news/politics/2...


In [104]:
data.columns

Index(['globaleventid', 'sqldate', 'monthyear', 'year', 'fractiondate',
       'actor1code', 'actor1name', 'actor1countrycode', 'actor1knowngroupcode',
       'actor1ethniccode', 'actor1religion1code', 'actor1religion2code',
       'actor1type1code', 'actor1type2code', 'actor1type3code', 'actor2code',
       'actor2name', 'actor2countrycode', 'actor2knowngroupcode',
       'actor2ethniccode', 'actor2religion1code', 'actor2religion2code',
       'actor2type1code', 'actor2type2code', 'actor2type3code', 'isrootevent',
       'eventcode', 'cameocodedescription', 'eventbasecode', 'eventrootcode',
       'quadclass', 'goldsteinscale', 'nummentions', 'numsources',
       'numarticles', 'avgtone', 'actor1geotype', 'actor1geofullname',
       'actor1geocountrycode', 'actor1geoadm1code', 'actor1geoadm2code',
       'actor1geolat', 'actor1geolong', 'actor1geofeatureid', 'actor2geotype',
       'actor2geofullname', 'actor2geocountrycode', 'actor2geoadm1code',
       'actor2geoadm2code', 'actor2geo

In [207]:
minDateAdded = data.iloc[0].dateadded


def date_added_to_timestamp(dateAdded, minDateAdded):
    minDateAdded = data.iloc[0].dateadded
    return parser.parse(str(dateAdded)) - parser.parse(str(minDateAdded))

def convert_timedelta_minutes(duration):
    days, seconds = duration.days, duration.seconds
    hours = days * 24 + seconds / 3600
    minutes = (seconds % 3600) / 60
    return int(hours*60+minutes) #hours, minutes, seconds

def format_timestamps(dateAdded):
    #minDateAdded = minDateAdded
    delta = date_added_to_timestamp(dateAdded, minDateAdded)
    return convert_timedelta_minutes(delta)


In [86]:
#format_timestamps()

In [208]:
df = data[['actor1countrycode', 'eventcode','actor2countrycode']] #Removed Goldstein scale , 'goldsteinscale'

timeStampSeries = data['dateadded'].map(format_timestamps)
df['timestamp'] = pd.Series(timeStampSeries, index=df.index)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,actor1countrycode,eventcode,actor2countrycode,timestamp
13,,020,CHN,0
14,,020,CHN,0
15,,020,CHN,0
40,,180,IND,0
47,,014,USA,0
...,...,...,...,...
683,USA,071,USA,43200
684,USA,010,,43200
685,USA,010,,43200
686,USA,040,USA,43200


In [209]:
print(actorCodes)
uniqueEventCodes = sorted(df['eventcode'].unique())

['CHN', 'JPN', 'IND', 'VNM', 'PHL', 'TWN', 'USA']


In [210]:
def process_entity_codes(df):
    dataFrame = df.copy()
    dataFrame['actor1countrycode'].fillna(0, inplace=True)
    dataFrame['actor2countrycode'].fillna(0, inplace=True)
    #Convert codes in dataframe to index values
    entityMap = {**{'None':0}, **{k: v+1 for v, k in enumerate(actorCodes)}} 
    dataFrame['actor1countrycode'] = dataFrame['actor1countrycode'].replace(entityMap)
    dataFrame['actor2countrycode'] = dataFrame['actor2countrycode'].replace(entityMap)
    return dataFrame, entityMap


def process_relation_codes(df):
    dataFrame = df.copy()
    relationMap = {k: v for v, k in enumerate(df['eventcode'].unique())}
    dataFrame['eventcode'] = dataFrame['eventcode'].replace(relationMap)
    return dataFrame, relationMap
    
dfEventCodes, relationMap = process_relation_codes(df)
dfFinal, entityMap = process_entity_codes(dfEventCodes)

In [211]:
dfFinal

Unnamed: 0,actor1countrycode,eventcode,actor2countrycode,timestamp
13,0,0,1,0
14,0,0,1,0
15,0,0,1,0
40,0,1,3,0
47,0,2,7,0
...,...,...,...,...
683,7,26,7,43200
684,7,11,0,43200
685,7,11,0,43200
686,7,3,7,43200


In [212]:
def write_entity_map(entityMap, path='./data/GDELT/entity2id.txt'):
    output = ""
    for v, k in enumerate(entityMap):
        output += "{}\t{}\n".format(k,v)
    with open(path, 'w') as the_file:
        the_file.write(output)

def write_relation_map(relationMap, path='./data/GDELT/relation2id.txt'):
    output = ""
    for v, k in enumerate(relationMap):
        output += "{}\t{}\n".format(k,v)
    with open(path, 'w') as the_file:
        the_file.write(output)
        
def write_stat(relationMap, entityMap, path='./data/GDELT/stat.txt'):
    #num entities num relations
    numEntities = len(entityMap.keys())
    numRelations = len(relationMap.keys())
    with open(path, 'w') as the_file:
        the_file.write("{}\t{}\t0".format(numEntities, numRelations))
        
def write_data(dataFrame, path='./data/GDELT/test.txt'):
    final = dataFrame.copy()
    final['ignore'] = 0
    final.to_csv(path, header=None, index=None, sep=' ', mode='w')

#write_entity_map(entityMap)
#write_relation_map(relationMap)
#write_stat(relationMap, entityMap)
write_data(dfFinal)