In [56]:
import pandas as pd
import numpy as np
from functools import cache
from IPython.display import display


# Lift pandas' display truncation: full cell content, all rows, all columns.
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)


# Read both input tables from the local ./data directory.
sourceDf = pd.read_csv("./data/Europe-Central-Asia_2018-2024_Sep27.csv")
equipement_loss_df = pd.read_csv("./data/russia-ukraine-equipement-losses.csv")

# Print each table's column index, one per line, as a quick schema check.
print(sourceDf.columns, equipement_loss_df.columns, sep="\n")

Index(['event_id_cnty', 'event_date', 'year', 'time_precision',
       'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'iso', 'region', 'country',
       'admin1', 'admin2', 'admin3', 'location', 'latitude', 'longitude',
       'geo_precision', 'source', 'source_scale', 'notes', 'fatalities',
       'tags', 'timestamp'],
      dtype='object')
Index(['Date', 'Russia_Total', 'Change', 'Ukraine_Total', 'Change.1',
       'Ratio RU/UA', 'Russia_Destroyed', 'Ukraine_Destroyed', 'Unnamed: 8',
       'Russia_Damaged', 'Ukraine_Damaged', 'Ukraine_Abandoned', 'Unnamed: 12',
       'Russia_Abandoned', 'Unnamed: 14', 'Russia_Captured',
       'Ukraine_Captured', 'Unnamed: 17', 'Russia_Tanks', 'Ukraine_Tanks',
       'Russia_Tank_Capture', 'Ukraine_Tank_Capture', 'Unnamed: 22',
       'Russia_AFV', 'Ukraine_AFV', 'Russia_AFV_Capture',
       'Ukraine_AFV_Capture', 'Unnamed:

In [57]:
# Distinct sub-event labels present in the source table, sorted.
unique_sub_event = np.sort(sourceDf['sub_event_type'].unique())

# Sub-event labels that are excluded from the war-related subset.
non_war_related_event = [
    'Agreement',
    'Arrests',
    'Mob violence',
    'Excessive force against protesters',
    'Peaceful protest',
    'Protest with intervention',
    'Sexual violence',
    'Violent demonstration',
    'Looting/property destruction',
]

# Every remaining label is treated as war-related.
war_related_event = list(
    filter(lambda event: event not in non_war_related_event, unique_sub_event)
)


# Keep rows whose country is Ukraine or Russia, then keep only the
# war-related sub-event types among them.
in_ukraine_or_russia = sourceDf["country"].isin(["Ukraine", "Russia"])
ukraine_russia_events = sourceDf[in_ukraine_or_russia]

has_war_sub_event = ukraine_russia_events["sub_event_type"].isin(war_related_event)
ukraine_war_events = ukraine_russia_events[has_war_sub_event]

# Sorting by (date, note, id) places notes with the same date and a shared
# leading text next to each other.
triplet_columns = ["event_date", "notes", "event_id_cnty"]
id_date_note_triplet = ukraine_war_events[triplet_columns].sort_values(by=triplet_columns)


display(id_date_note_triplet.head(5))


Unnamed: 0,event_date,notes,event_id_cnty
424080,2018-01-01,"On 1 January 2018, Military Forces of Ukraine fired at the outskirts of Donetsk (Volvo Centre). The OSCE SMM observed a total of 9 bursts of fire involving small arms near Donetsk.",UKR18
424125,2018-01-01,"On 1 January 2018, Military Forces of Ukraine fired with 82 mm mortars at NAF near Yasynuvata. This day one DPR soldier was injured in Donbass region in the result of the shelling.",UKR32
424124,2018-01-01,"On 1 January 2018, Military Forces of Ukraine fired with 82 mm mortars at NAF rebel forces near Sosnovskoye. This day one DPR soldier was injured in Donbass region in the result of the shelling.",UKR29
424062,2018-01-01,"On 1 January 2018, Military Forces of Ukraine fired with recoilless guns, automatic grenade launchers and small arms at LPR Luhansk People's Militia positions (NAF) near Kalynivka.",UKR22
424074,2018-01-01,"On 1 January 2018, Military Forces of Ukraine fired with unspecified artillery on Zaytseve in Donetsk region. No casualties were reported.",UKR33


In [58]:
# Two notes count as similar only when their word-level edit distance is
# strictly below this threshold.
max_edit_distance = 5


def _word_edit_distance(rep_words, cur_words):
    """Return the word-level Levenshtein distance between two token lists.

    Computed iteratively with two rolling rows, so the call depth does not
    grow with the length of the notes.

    Args:
        rep_words: tokens of the representative note.
        cur_words: tokens of the note being compared.

    Returns:
        The minimum number of word insertions, deletions and substitutions
        needed to turn one list into the other.
    """
    # Row for "zero words of rep consumed": j insertions reach cur_words[:j].
    previous_row = list(range(len(cur_words) + 1))

    for i, rep_word in enumerate(rep_words, start=1):
        # First column: deleting the i leading words of rep reaches the empty list.
        current_row = [i]
        for j, cur_word in enumerate(cur_words, start=1):
            if rep_word == cur_word:
                # Equal words cost nothing; carry the diagonal value over.
                current_row.append(previous_row[j - 1])
            else:
                current_row.append(1 + min(
                    previous_row[j - 1],  # substitute
                    previous_row[j],      # delete a word from rep
                    current_row[j - 1],   # insert a word from cur
                ))
        previous_row = current_row

    return previous_row[-1]


def is_similar(rep_sentence, cur_sentence):
    """Decide whether two event notes describe the same split event.

    Two notes are considered similar when all of the following hold:
      * both contain the substring "coded to",
      * their first five space-separated words are identical,
      * their word-level edit distance is strictly below the module-level
        ``max_edit_distance``.

    The previous implementation returned ``len(rep_words) - j`` when the
    current note ran out of words; the remaining cost is
    ``len(rep_words) - i``. That made the distance wrong (even negative)
    for notes of different lengths, so unrelated notes could be merged.

    Args:
        rep_sentence: note of the group's representative event.
        cur_sentence: note of the event being tested.

    Returns:
        True if the notes are similar, False otherwise. Non-string inputs
        (for example a missing note) are never similar to anything.
    """
    if not isinstance(rep_sentence, str) or not isinstance(cur_sentence, str):
        return False

    # seems that related notes all have a comment of "coded to xxx"
    keyword = "coded to"
    if keyword not in rep_sentence or keyword not in cur_sentence:
        return False

    rep_words = rep_sentence.split(" ")
    cur_words = cur_sentence.split(" ")

    if rep_words[:5] != cur_words[:5]:
        return False

    # The edit distance can never be smaller than the difference in word
    # counts, so a large gap already rules out similarity.
    if abs(len(rep_words) - len(cur_words)) >= max_edit_distance:
        return False

    return _word_edit_distance(rep_words, cur_words) < max_edit_distance


# Two real notes from the same split event that differ only in the place name.
s1 = (
    "On 4 October 2022, following clashes with Russian forces, "
    "Ukrainian forces regained control over Biliaivka, Kherson region. "
    "80 Ukrainian soldiers were killed across 17 locations. "
    "Casualties on Russian side unknown. "
    "[Russian MoD reported 80 fatalities. "
    "Coded as 10 fatalities split across 17 events. "
    "1 fatality coded to this event]"
)
s2 = (
    "On 4 October 2022, following clashes with Russian forces, "
    "Ukrainian forces regained control over Chereshneve, Kherson region. "
    "80 Ukrainian soldiers were killed across 17 locations. "
    "Casualties on Russian side unknown. "
    "[Russian MoD reported 80 fatalities. "
    "Coded as 10 fatalities split across 17 events. "
    "1 fatality coded to this event]"
)

# Quick sanity check of the similarity predicate on the pair above.
print(is_similar(s1, s2))

        

True


In [59]:




# Build event_id_cnty -> representative event_id_cnty, so that every event
# whose note is similar to its group's first note shares one representative.
# The records were sorted beforehand, so similar notes should be adjacent.

mapping = {}

# State of the group currently being extended. The placeholder note only has
# to be a string; the date check fails first because prev_date starts as None.
prev_event_id, prev_note, prev_date = None, "XXX XXX", None


for cur_date, cur_note, cur_event_id in zip(
    id_date_note_triplet["event_date"],
    id_date_note_triplet["notes"],
    id_date_note_triplet["event_id_cnty"],
):
    extends_group = cur_date == prev_date and is_similar(prev_note, cur_note)

    if not extends_group:
        # Start a new group with the current event as its representative.
        prev_event_id, prev_note, prev_date = cur_event_id, cur_note, cur_date

    # Either the unchanged representative, or the event itself if it just
    # opened a new group.
    mapping[cur_event_id] = prev_event_id

# print({x:y for x, y in mapping.items() if x != y})


In [53]:
# targets = "|".join(["UKR47730", "UKR47716"])
# ukraine_war_events = ukraine_war_events[ukraine_war_events["event_id_cnty"].str.contains(targets)]
# display(ukraine_war_events[ukraine_war_events["event_id_cnty"].str.contains(targets)])


# incorporate this mapping as a new key in the dataframe


def group_key(x):
    """Return the representative event id stored for ``x`` in ``mapping``.

    Raises:
        KeyError: if ``x`` has no entry in ``mapping``.
    """
    representative_id = mapping[x]
    return representative_id


def _comma_joined(values):
    """Render every value as text and join them with commas."""
    return ','.join(map(str, values))


# Columns carried into the grouped table.
projected = ukraine_war_events[[
    "event_id_cnty",
    "event_date",
    "sub_event_type",
    "actor1",
    "actor2",
    "latitude",
    "notes",
    "longitude",
    "fatalities",
]]

# One group label per row: the representative id of that row's event.
representative_ids = projected["event_id_cnty"].map(group_key)

# Collapse each group to one row: descriptive fields come from the group's
# first row, coordinates are kept as comma-separated lists, fatalities add up.
aggregated = projected.groupby(representative_ids).agg({
    "event_id_cnty": "first",
    "event_date": "first",
    "sub_event_type": "first",
    "actor1": "first",
    "actor2": "first",
    "notes": "first",
    "latitude": _comma_joined,
    "longitude": _comma_joined,
    "fatalities": 'sum',
})

display(aggregated.head(10))





Unnamed: 0_level_0,event_id_cnty,event_date,sub_event_type,actor1,actor2,notes,latitude,longitude,fatalities
event_id_cnty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
RUS1,RUS1,2018-01-03,Armed clash,Unidentified Armed Group (Russia),Police Forces of Russia (2000-),"On 3 January 2018, Abubakar Ustarkhanov, the local police chief from Avtury village (Shalinskiy District, Chechnya) was shot dead, as he was leaving a local shop and the murderer allegedly escaped in a police car.",43.0948,46.0043,1
RUS10,RUS10,2018-01-08,Grenade,Unidentified Armed Group (Russia),,"On 8 January 2018, a grenade was thrown in Nazran on a house of the family of Magomed Khazbiyev, a member of opposition in Ingushetia, destroying a car.",43.2152,44.7761,0
RUS1000,RUS1000,2018-08-03,Remote explosive/landmine/IED,Unidentified Armed Group (Russia),,"On 3 August 2018, an explosive device damaged the entrance to a national Pension Fund office in Kaluga city, Kaluga Municipality, Kaluga Oblast. No information on attackers, no casualties.",54.5293,36.2754,0
RUS10000,RUS10000,2023-05-29,Disrupted weapons use,Military Forces of Russia (2000-),Military Forces of Ukraine (2019-) Air Force,"Interception: On 29 May 2023, Russian air defense shot down a (presumably) Ukrainian drone near Staroselye.",50.705,35.4653,0
RUS10001,RUS10001,2023-05-30,Disrupted weapons use,Military Forces of Russia (2000-),Military Forces of Ukraine (2019-) Air Force,"Interception: On 30 May 2023, Russian air defense shot down a (presumably) Ukrainian drone over Antonovka.",50.5501,35.7583,0
RUS10002,RUS10002,2023-05-30,Disrupted weapons use,Military Forces of Russia (2000-),Military Forces of Ukraine (2019-) Air Force,"Interception: On 30 May 2023, Russian air defense shot down several (presumably) Ukrainian drones over Klimovskiy district. (coded to Klimovo)",52.3833,32.1833,0
RUS10003,RUS10003,2023-05-27,Disrupted weapons use,Military Forces of Russia (2000-),Unidentified Armed Group (Russia),"Interception: On 27 May 2023, Russian air defense shot down several unidentified drones over Ilskiy, close to the Ilskiy oil refinery plant.",44.8423,38.5681,0
RUS10004,RUS10004,2023-05-30,Disrupted weapons use,Military Forces of Russia (2000-),Unidentified Armed Group (Russia),"Interception: On 30 May 2023, Russian air defense shot down at least 8 drones over Moscow and Moscow region, with some sources claiming that overall more than 30 drones were intercepted. Several drones were shot down over Ilyinskoe, Moscow region, in the area where Russia's elite have luxurious property. While Moscow blamed Ukraine for the attack, Ukraine denied responsibility. According to some analysts, one of the drones captured on video resembled a Ukrainian UJ-22 drone.",55.757,37.242,0
RUS10005,RUS10005,2023-05-30,Disrupted weapons use,Military Forces of Russia (2000-),Unidentified Armed Group (Russia),"Interception: On 30 May 2023, Russian air defense shot down at least 8 drones over Moscow and Moscow region, with some sources claiming that overall more than 30 drones were intercepted. One drone was shot down over Moscow - Novomoskovsky, Moscow region. While Moscow blamed Ukraine for the attack, Ukraine denied responsibility. According to some analysts, one of the drones captured on video resembled a Ukrainian UJ-22 drone.",55.5987,37.3517,0
RUS10006,RUS10006,2023-06-01,Disrupted weapons use,Military Forces of Russia (2000-),Unidentified Armed Group (Russia),"Interception: On 1 June 2023, Russian air defense shot down several unidentified drones over Kursk.",51.7373,36.1873,0


In [60]:

# Give the losses table the same date column name as the events table so the
# two can be joined on it. Re-running is harmless: once 'Date' is gone the
# rename has nothing left to change.
equipement_loss_df = equipement_loss_df.rename({'Date': 'event_date'}, axis="columns")

# Left join: every aggregated event is kept; dates absent from the losses
# table leave the loss columns empty.
joined = aggregated.merge(equipement_loss_df, how="left", on="event_date")

display(joined.head(5))

Unnamed: 0,event_id_cnty,event_date,sub_event_type,actor1,actor2,notes,latitude,longitude,fatalities,Russia_Total,Change,Ukraine_Total,Change.1,Ratio RU/UA,Russia_Destroyed,Ukraine_Destroyed,Unnamed: 8,Russia_Damaged,Ukraine_Damaged,Ukraine_Abandoned,Unnamed: 12,Russia_Abandoned,Unnamed: 14,Russia_Captured,Ukraine_Captured,Unnamed: 17,Russia_Tanks,Ukraine_Tanks,Russia_Tank_Capture,Ukraine_Tank_Capture,Unnamed: 22,Russia_AFV,Ukraine_AFV,Russia_AFV_Capture,Ukraine_AFV_Capture,Unnamed: 27,Russia_IFV,Ukraine_IFV,Unnamed: 30,Russia_APC,Ukraine_APC,Unnamed: 33,Russia_IMV,Ukraine_IMV,Unnamed: 36,Russia_Engineering,Ukraine_Engineering,Unnamed: 39,Russia_Coms,Ukraine_Coms,Unnamed: 42,Russia_Vehicles,Ukraine_Vehicles,Unnamed: 45,Russia_Aircraft,Ukraine_Aircraft,Unnamed: 48,Russia_Infantry,Ukraine_Infantry,Unnamed: 51,Russia_Logistics,Ukraine_Logistics,Unnamed: 54,Russia_Armor,Ukraine_Armor,Unnamed: 57,Russia_Antiair,Ukraine_Antiair,Unnamed: 60,Russia_Artillery,Ukraine_Artillery,Unnamed: 63,UNHCR_Ukraine_Border,UNHCR_Ukraine_Refugees,UNHCR_Returning_Ukraine_Refugees
0,RUS1,2018-01-03,Armed clash,Unidentified Armed Group (Russia),Police Forces of Russia (2000-),"On 3 January 2018, Abubakar Ustarkhanov, the local police chief from Avtury village (Shalinskiy District, Chechnya) was shot dead, as he was leaving a local shop and the murderer allegedly escaped in a police car.",43.0948,46.0043,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,RUS10,2018-01-08,Grenade,Unidentified Armed Group (Russia),,"On 8 January 2018, a grenade was thrown in Nazran on a house of the family of Magomed Khazbiyev, a member of opposition in Ingushetia, destroying a car.",43.2152,44.7761,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,RUS1000,2018-08-03,Remote explosive/landmine/IED,Unidentified Armed Group (Russia),,"On 3 August 2018, an explosive device damaged the entrance to a national Pension Fund office in Kaluga city, Kaluga Municipality, Kaluga Oblast. No information on attackers, no casualties.",54.5293,36.2754,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,RUS10000,2023-05-29,Disrupted weapons use,Military Forces of Russia (2000-),Military Forces of Ukraine (2019-) Air Force,"Interception: On 29 May 2023, Russian air defense shot down a (presumably) Ukrainian drone near Staroselye.",50.705,35.4653,0,10469.0,21.0,3451.0,8.0,3.03,6898.0,2249.0,,352.0,214.0,94.0,,399.0,,2820.0,894.0,,2002.0,505.0,544.0,139.0,,862.0,277.0,265.0,79.0,,2366.0,543.0,,312.0,265.0,,189.0,316.0,,306.0,57.0,,242.0,13.0,,2518.0,610.0,,172.0,96.0,,2867.0,1124.0,,548.0,70.0,,2864.0,782.0,,169.0,178.0,,732.0,308.0,,,,
4,RUS10001,2023-05-30,Disrupted weapons use,Military Forces of Russia (2000-),Military Forces of Ukraine (2019-) Air Force,"Interception: On 30 May 2023, Russian air defense shot down a (presumably) Ukrainian drone over Antonovka.",50.5501,35.7583,0,10468.0,-1.0,3451.0,0.0,3.03,6897.0,2249.0,,352.0,214.0,94.0,,399.0,,2820.0,894.0,,2001.0,505.0,544.0,139.0,,862.0,277.0,265.0,79.0,,2366.0,543.0,,312.0,265.0,,189.0,316.0,,306.0,57.0,,242.0,13.0,,2518.0,610.0,,172.0,96.0,,2867.0,1124.0,,548.0,70.0,,2863.0,782.0,,169.0,178.0,,732.0,308.0,,,,
