In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import json


In [2]:

# Get the 2024 monthly candidate-GED data collections from the UCDP downloads page.
url = 'https://ucdp.uu.se/downloads/'
# A timeout stops the notebook from hanging forever on a stalled connection;
# raise_for_status() fails fast instead of silently parsing an HTTP error page.
reqs = requests.get(url, timeout=60)
reqs.raise_for_status()
soup = BeautifulSoup(reqs.text, 'html.parser')
# Dots are escaped so they only match literal dots in the host name and extension.
pattern = re.compile(r"https://ucdp\.uu\.se/downloads/candidateged/GEDEvent_v24_.*\.csv")
links = soup.find_all("a", href=pattern)

# dict.fromkeys() drops repeated links while preserving page order, so a CSV that
# is linked twice on the page is not loaded (and its events counted) twice.
urls = list(dict.fromkeys(link.get('href') for link in links))
if not urls:
    # Without any CSV there is nothing to clean or export downstream; stop here
    # with an explicit message rather than failing later on a missing column.
    raise RuntimeError(
        'No GEDEvent_v24 CSV links found on ' + url
        + ' - the page layout or the file naming may have changed.'
    )

# Load every available dataset, tagging each row with the CSV it came from,
# then concatenate once (concatenating inside the loop re-copies all rows each time).
frames = [pd.read_csv(csv_url).assign(data_orig=csv_url) for csv_url in urls]
data = pd.concat(frames, ignore_index=True)



In [7]:
#data cleansing
# Drop a trailing " (...)" qualifier from country names so they can be matched
# against plain country names later. Raw string: "\(" is not a valid Python escape.
data['country'] = data['country'].str.replace(r" \(.*\)", "", regex=True)

# Keep only events with date precision <= 1 week (date_prec < 4).
# The original second condition `date_prec < 5` is implied by this one.
precise_enough = data['date_prec'] < 4
# Events outside conflicts are those whose conflict_name contains the literal
# marker 'XXX'. na=False treats a missing conflict_name as "no marker" so the
# negation below cannot fail on NaN values.
# NOTE(review): rows with a missing conflict_name are therefore kept - confirm this is intended.
outside_conflict = data['conflict_name'].str.contains('XXX', regex=False, na=False)
data = data[precise_enough & ~outside_conflict]

# Retrieve country geometries and location ids (iso-alpha codes) ---> original data source from kaggle
# The context manager closes the file once parsed; the encoding is pinned so that
# country names decode the same way regardless of the platform default.
with open('../data/world-countries.json', encoding='utf-8') as f:
    # json.load returns the GeoJSON document as a dictionary
    geoj = json.load(f)

# Build a country-name -> iso_alpha lookup from the GeoJSON features (single pass)
countryIsoAlpha = pd.DataFrame(
    [(feature['properties']['name'], feature['id']) for feature in geoj['features']],
    columns=['country', 'iso_alpha'],
)
# Left merge keeps every event; countries whose name is absent from the GeoJSON get a NaN iso_alpha
data = pd.merge(data, countryIsoAlpha, on='country', how='left')


# Retrieve per-country GPS locations, used to assign centre coordinates to each country.
# Original data source (kaggle):
# https://www.kaggle.com/datasets/paultimothymooney/latitude-and-longitude-for-every-country-and-state
gpsTable = pd.read_csv('../data/world_country_latitude_and_longitude.csv')
# Left join on the country name so events without a match in the table are kept
data = data.merge(gpsTable, how='left', on='country')

# Post-process "date_start" into calendar parts, for data aggregation purposes.
# Parse the column once and reuse the result instead of re-wrapping it for every
# derived field; index=data.index keeps the parsed values positionally attached to their rows.
date_start = pd.Series(pd.DatetimeIndex(data['date_start']), index=data.index)
data['date_start'] = date_start
data['date_start_m'] = date_start.dt.month
data['date_start_y'] = date_start.dt.year
# NOTE: ISO week numbers near 1 Jan / 31 Dec can belong to the neighbouring ISO year,
# so date_start_w is not always consistent with date_start_y.
data['date_start_w'] = date_start.dt.isocalendar().week
# Weekly period rendered as "start/end"; the regex strips "/end" leaving the week's first day
data['date_start_wymd'] = (
    date_start.dt.to_period('W')
    .astype(str)
    .str.replace('/.*[0-9]$','',regex=True)
)
# Keep only events whose date_start falls in 2024. The explicit copy makes `data`
# an independent frame so later column assignments do not raise SettingWithCopyWarning.
data = data[data['date_start_y'] == 2024].copy()

# Reference link to the dedicated UCDP page of each event's country
data['country_link'] = (
    'https://ucdp.uu.se/country/'
    + data['country_id'].astype(str)
)

# Write the final dataset to disk as CSV (row index omitted)
data.to_csv('../data/data_2024.csv', index=False)