In [8]:
import pandas as pd
import numpy as np


pd.set_option('display.max_colwidth', None)  # Show full content in each cell
pd.set_option('display.max_rows', None)      # Show all rows
pd.set_option('display.max_columns', None)   # Show all columns


sourceDf = pd.read_csv("./data/Europe-Central-Asia_2018-2024_Sep27.csv")
print(sourceDf.columns)

Index(['event_id_cnty', 'event_date', 'year', 'time_precision',
       'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'iso', 'region', 'country',
       'admin1', 'admin2', 'admin3', 'location', 'latitude', 'longitude',
       'geo_precision', 'source', 'source_scale', 'notes', 'fatalities',
       'tags', 'timestamp'],
      dtype='object')
1.26.2


**Data Quality Check**
- all columns are populated

In [10]:

import reverse_geocoder as rg
import pycountry
from functools import cache

#trim leading and trailing spaces
sourceDf["notes"] = sourceDf["notes"].str.strip()


#looks like all columns are populated
for column in sourceDf.columns:
    emptyCount = sourceDf[sourceDf[column].isnull() | sourceDf[column] == ''][column].count()
    if emptyCount > 0:
        print(f"{column} empty count: {emptyCount}")



def check_location_mismatch():

    locations = sourceDf[["country","latitude", "longitude"]]

    @cache
    def get_country_name(country_code):
        country = pycountry.countries.get(alpha_2=country_code)
        return  country.name if country else "Unknown"



    # compare with reverse_geocoder to see any coordinate to country mismatches
    # most mismatch happens on border towns and our dataset is correct (after eyeballing on google map)

    for row in locations.itertuples():
        given_country = row.country
        if given_country not in ["Ukraine", "Russia"]:
            continue
        coordinates = (row.latitude, row.longitude)  
        location = rg.search(coordinates) 
        country_code = location[0]['cc']
        computed_country_name =  get_country_name(country_code)

        # Russia == Russian Federation
        computed_country_name = "Russia" if computed_country_name == "Russian Federation" else computed_country_name
        if given_country != computed_country_name:
            print(f"Record country: {given_country}; Computed country: {computed_country_name}; Coordinate: {coordinates}")


# check_location_mismatch()


In [3]:
unique_countries = sourceDf['country'].unique()
unique_countries = np.sort(unique_countries)
print(unique_countries)

['Albania' 'Andorra' 'Armenia' 'Austria' 'Azerbaijan'
 'Bailiwick of Guernsey' 'Bailiwick of Jersey' 'Belarus' 'Belgium'
 'Bosnia and Herzegovina' 'Bulgaria' 'Croatia' 'Cyprus' 'Czech Republic'
 'Denmark' 'Estonia' 'Faroe Islands' 'Finland' 'France' 'Georgia'
 'Germany' 'Gibraltar' 'Greece' 'Greenland' 'Hungary' 'Iceland' 'Ireland'
 'Isle of Man' 'Italy' 'Kazakhstan' 'Kosovo' 'Kyrgyzstan' 'Latvia'
 'Liechtenstein' 'Lithuania' 'Luxembourg' 'Malta' 'Moldova' 'Monaco'
 'Montenegro' 'Netherlands' 'North Macedonia' 'Norway' 'Poland' 'Portugal'
 'Romania' 'Russia' 'San Marino' 'Serbia' 'Slovakia' 'Slovenia' 'Spain'
 'Sweden' 'Switzerland' 'Tajikistan' 'Turkmenistan' 'Ukraine'
 'United Kingdom' 'Uzbekistan' 'Vatican City']


In [4]:


unique_sub_event = sourceDf['sub_event_type'].unique()
unique_sub_event = np.sort(unique_sub_event)
print(unique_sub_event)

non_war_related_event  = [
    'Agreement',
    'Arrests',
    'Mob violence',
    'Excessive force against protesters',
    'Peaceful protest',
    'Protest with intervention',
    'Sexual violence',
    'Violent demonstration',
    'Looting/property destruction'
]

war_related_event =  [event for event in unique_sub_event if event not in non_war_related_event ]

['Abduction/forced disappearance' 'Agreement' 'Air/drone strike'
 'Armed clash' 'Arrests' 'Attack' 'Change to group/activity'
 'Disrupted weapons use' 'Excessive force against protesters'
 'Government regains territory' 'Grenade'
 'Headquarters or base established' 'Looting/property destruction'
 'Mob violence' 'Non-state actor overtakes territory'
 'Non-violent transfer of territory' 'Other' 'Peaceful protest'
 'Protest with intervention' 'Remote explosive/landmine/IED'
 'Sexual violence' 'Shelling/artillery/missile attack' 'Suicide bomb'
 'Violent demonstration']


**Scope Investigation**

Countries other than Ukraine and Russion are mostly not involved in direct conflicts.

Here are some ways a third country can be involved in the war by eye-balling related notes mentioning Ukraine/Russia:

- Russia moving/deploying/firing weapons in Belarus
- A third-country shipping supplies to Russia/Ukraine
- Russia missile/drone crossed or fell in Belarus/Moldova/Romania (this might be less significant to the war intensity)


Overall, we may revisit these records later


In [5]:
# Does records of countires other than Ukraine, Russia and Belarus contain events directed related to Ukraine wars?

immediately_related_countries = ["Ukraine", "Russia"]
other_country_records = sourceDf[~sourceDf["country"].isin(immediately_related_countries)]

#filter for notes containing related keywords
related_keywords = "|".join(["Ukraine", "Russia", "Ukrainian"])
other_country_related_notes = other_country_records[other_country_records["notes"].str.contains(related_keywords, case=False, na=False)]

#filter out notes containing irrelevent keywords
unrelated_keywords = "|".join([ "Azerbaijan", "Armenia", "Israel", "Kazakh", "Displacement", "Security measures", "Non-violent activity"])
other_country_related_notes = other_country_related_notes[~other_country_related_notes["notes"].str.contains(unrelated_keywords, case=False, na=False)]

other_country_related_notes = other_country_related_notes[other_country_related_notes["sub_event_type"].isin(war_related_event)]
notes = other_country_related_notes["notes"].tolist()
notes.sort()
print("\n".join(notes))

Around 1 February 2024 (week of), police physically assaulted a male citizen of Ukraine at the police detention facility in Talgar township of Almaty region after police arrest for illegal cultivation of marijuana at his residence in the village of Besagash.
Around 1 January 2024 (month of), Belarusian state security officers beat up 12 detainees in Drogichinskiy district (coded to Drahichyn (Drogichinskiy, Belarus)). The detainees were accused of collaborating with Ukrainian security services.
Around 1 May 2023 (beginning of month), Russian journalist and activists felt sick after her stay in a hotel in Prague. Further investigation revealed poisoning by a nerve agent, and linked it with the Russian government.
Around 13 April 2023 (week of), a 40-year-old Uzbekistan-native male citizen of Russia was detained by police and allegedly tortured while in custody at the city police department detention facility near Yashnobod district of Tashkent city in connection with a victim's alleged 

**Dataset Basic Features**

In [7]:
ukraine_russia_events = sourceDf[(sourceDf["country"] == "Ukraine") | (sourceDf["country"] == "Russia")]

print("total relevant event count:", ukraine_russia_events["country"].count())
print("total war related event count:", ukraine_russia_events[ukraine_russia_events["sub_event_type"].isin(war_related_event)]["event_id_cnty"].count())
print("min date:", ukraine_russia_events["event_date"].min())
print("max date:", ukraine_russia_events["event_date"].max())

total relevant event count: 209058
total war related event count: 192870
min date: 2018-01-01
max date: 2024-09-27
