# Analysis of GDELT violence data

## Query

```sql
SELECT Year, 
	   ActionGeo_CountryCode AS CountryCode,
	   EventRootCode,
	   EventCode,
	   COUNT(GLOBALEVENTID) AS SumEvents,
	   ANY_VALUE(GoldsteinScale) AS GoldsteinScale,
	   AVG(NumMentions) as AvgNumMentions,
	   SUM(NumMentions) as SumNumMentions,
	   AVG(AvgTone) as AvgAvgTone,
FROM `gdelt-bq.full.events`
WHERE 
	EventRootCode IN ("17", "18", "19") 
	AND Year >= 1979
GROUP BY Year, ActionGeo_CountryCode, EventRootCode, EventCode
ORDER BY Year
```

In [1]:
import pandas as pd
import numpy as np

### Get CAMEO event codes. 

In [2]:
!curl -o data/CAMEO_codes.txt https://www.gdeltproject.org/data/lookups/CAMEO.eventcodes.txt 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 12157  100 12157    0     0  13257      0 --:--:-- --:--:-- --:--:-- 13242


In [3]:
cameo_codes = pd.read_csv('data/CAMEO_codes.txt', sep='\t', 
                          names=['code', 'descr'], skiprows=1,
                         dtype=str)

cameo_codes['code'] = cameo_codes['code'].astype(str)

cameo_codes.head()

Unnamed: 0,code,descr
0,1,MAKE PUBLIC STATEMENT
1,10,"Make statement, not specified below"
2,11,Decline comment
3,12,Make pessimistic comment
4,13,Make optimistic comment


### Load query data. 

In [4]:
data = pd.read_csv('data/query.csv')
data['EventCode'] = data['EventCode'].astype(str)
data['EventRootCode'] = data['EventRootCode'].astype(str)

if 'f0_' in data.columns:
    data.rename({'f0_': 'GoldsteinScale'},
                axis=1, inplace=True)

data.shape

(171710, 9)

In [5]:
data.head()

Unnamed: 0,Year,CountryCode,EventRootCode,EventCode,SumEvents,GoldsteinScale,AvgNumMentions,SumNumMentions,AvgAvgTone
0,1979,AR,17,173,57,-5.0,3.754386,214,4.962598
1,1979,SA,18,186,7,-10.0,2.571429,18,3.581601
2,1979,ET,18,183,3,-10.0,4.333333,13,3.660743
3,1979,RQ,19,190,4,-10.0,4.75,19,3.061159
4,1979,ID,19,190,26,-10.0,3.961538,103,3.262348


### Merge event and event root description. 

In [6]:
data = data.merge(cameo_codes, left_on='EventCode', right_on='code', how='left')
data = data.merge(cameo_codes, left_on='EventRootCode', right_on='code', how='left')

data.drop(['code_x', 'code_y'], axis=1, inplace=True)

data.rename({'descr_x': 'EventDescr',
             'descr_y': 'EventRootDescr'},
            axis=1, inplace=True)

In [7]:
data.head()

Unnamed: 0,Year,CountryCode,EventRootCode,EventCode,SumEvents,GoldsteinScale,AvgNumMentions,SumNumMentions,AvgAvgTone,EventDescr,EventRootDescr
0,1979,AR,17,173,57,-5.0,3.754386,214,4.962598,"Arrest, detain, or charge with legal action",COERCE
1,1979,SA,18,186,7,-10.0,2.571429,18,3.581601,Assassinate,ASSAULT
2,1979,ET,18,183,3,-10.0,4.333333,13,3.660743,"Conduct suicide, car, or other non-military bo...",ASSAULT
3,1979,RQ,19,190,4,-10.0,4.75,19,3.061159,"Use conventional military force, not specified...",FIGHT
4,1979,ID,19,190,26,-10.0,3.961538,103,3.262348,"Use conventional military force, not specified...",FIGHT


### Some countries are NULL. 

In [8]:
data.isnull().sum()

Year                 0
CountryCode       1291
EventRootCode        0
EventCode            0
SumEvents            0
GoldsteinScale       0
AvgNumMentions       0
SumNumMentions       0
AvgAvgTone           0
EventDescr           0
EventRootDescr       0
dtype: int64

### Download and merge country names. 

In [9]:
!curl -o data/country_names.csv https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5064  100  5064    0     0  19858      0 --:--:-- --:--:-- --:--:-- 19858


In [10]:
cn = pd.read_csv('data/country_names.csv')
cn = cn.drop('ISO 3166', axis=1)
cn.columns = ['CountryCode', 'CountryName']
cn.head()

Unnamed: 0,CountryCode,CountryName
0,AF,Afghanistan
1,AX,Akrotiri
2,AL,Albania
3,AG,Algeria
4,AQ,American Samoa


In [11]:
data = data.merge(cn, on='CountryCode', how='left')

### Downloading and merging total values. 

In [12]:
!curl -o data/yearly_country_totals.csv http://data.gdeltproject.org/normfiles/yearly_country.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  136k  100  136k    0     0   174k      0 --:--:-- --:--:-- --:--:--  174k


In [13]:
totals = pd.read_csv('data/yearly_country_totals.csv', names=['Year', 'CountryCode', 'TotalEvents'])
totals.head()

Unnamed: 0,Year,CountryCode,TotalEvents
0,1920,,13636
1,1920,AA,3
2,1920,AC,63
3,1920,AE,1585
4,1920,AF,3523


In [14]:
data = data.merge(totals, on=['Year', 'CountryCode'], how='left')

### Compute normalized events count (over 1000 events). 

In [15]:
data['NormalizedEvents1000'] = data['SumEvents'] / data['TotalEvents'] * 10**3

### Removing rows with NULL values. 

In [16]:
before = data.shape[0]
data = data.dropna()
after = data.shape[0]
removed = before - after
percent_removed = round(removed / before, 2) * 100
print(f"Removed {before - after} rows ({percent_removed}% of {before} rows.)")

Removed 4641 rows (3.0% of 173011 rows.)


In [17]:
data.isnull().sum().sum()

0

In [18]:
data.shape

(168370, 14)

### Reordering columns.

In [19]:
prev = len(data.columns)
data = data[['Year', 'CountryCode', 'CountryName', 
             'SumEvents', 'TotalEvents', 'NormalizedEvents1000', 
             'EventRootCode', 'EventRootDescr',
             'EventCode', 'EventDescr', 
             'GoldsteinScale', 'AvgNumMentions', 'SumNumMentions', 'AvgAvgTone'
             ]]

assert len(data.columns) == prev

In [20]:
data.head()

Unnamed: 0,Year,CountryCode,CountryName,SumEvents,TotalEvents,NormalizedEvents1000,EventRootCode,EventRootDescr,EventCode,EventDescr,GoldsteinScale,AvgNumMentions,SumNumMentions,AvgAvgTone
0,1979,AR,Argentina,57,948,60.126582,17,COERCE,173,"Arrest, detain, or charge with legal action",-5.0,3.754386,214,4.962598
1,1979,SA,Saudi Arabia,7,2691,2.601263,18,ASSAULT,186,Assassinate,-10.0,2.571429,18,3.581601
2,1979,ET,Ethiopia,3,1374,2.183406,18,ASSAULT,183,"Conduct suicide, car, or other non-military bo...",-10.0,4.333333,13,3.660743
3,1979,RQ,Puerto Rico,4,170,23.529412,19,FIGHT,190,"Use conventional military force, not specified...",-10.0,4.75,19,3.061159
4,1979,ID,Indonesia,26,1793,14.500837,19,FIGHT,190,"Use conventional military force, not specified...",-10.0,3.961538,103,3.262348


In [21]:
data.isna().sum().sum()

0

## Export

In [22]:
data.to_csv('gdelt_conflict/gdelt_conflict_10.csv', index=None)