In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the CSV into `df`, parsing the "date" column as datetimes at read time.
df = pd.read_csv("combined_ge15days_official.csv", parse_dates=["date"])

In [6]:
df.columns

Index(['MAILITM_FID', 'EVENT_TYPE_NM', 'date', 'établissement_postal',
       'EVENT_TYPE_CD', 'next_établissement_postal', 'id',
       'duration_to_next_step', 'total_duration'],
      dtype='object')

In [3]:
df.shape

(34611297, 9)

In [4]:
# Number of distinct (non-null) values per column, lowest cardinality first.
distinct_values_count = df.nunique().sort_values(ascending=True)
print(distinct_values_count)

EVENT_TYPE_NM                      26
EVENT_TYPE_CD                      26
id                                143
next_établissement_postal        3796
établissement_postal             3970
total_duration                1778999
duration_to_next_step         1928513
MAILITM_FID                   3048080
date                         15695109
dtype: int64


In [5]:

# Count how many rows carry each event type name (most frequent first)
event_type = df['EVENT_TYPE_NM'].value_counts()

print(event_type)


EVENT_TYPE_NM
Recevoir envoi au bureau de livraison (Ent)                    6703810
Expédier envoi à adresse nationale (Ent)                       4977810
Expédier envoi à adresse nationale (Srt)                       4574729
Recevoir envoi au bureau d'échange (Ent)                       3171717
Insérer envoi dans sac (Srt)                                   2885480
Vaine tentative de livraison d'envoi (Ent)                     2879557
Livraison d'envoi (Ent)                                        2728635
Expédition d'envoi à l'étranger (EDI-reçu)                     2723907
Transmettre envoi à l'agent de livraison (Ent)                 1752347
Réception d'envoi du client (Srt)                              1741234
Enregistrer détails d'envoi au bureau d'échange (Srt)           290829
Recevoir envoi au bureau d'échange (Srt)                        100261
Expédier envoi à la douane (Ent)                                 45737
Renvoyer envoi de la douane (Ent)                              

In [7]:
# Columns excluded from the duplicate comparison; every other column must
# match for two rows to be considered duplicates of each other.
cols_ignore = ["duration_to_next_step", "total_duration", "id"]
cols_check = [col for col in df.columns if not (col in cols_ignore)]

# Flag every row that belongs to a duplicate group. With keep=False ALL
# members of a group are flagged (including the one that would be kept),
# so the figure printed below is larger than the number of rows a
# keep="first" de-duplication would actually remove.
dup_mask = df.duplicated(subset=cols_check, keep=False)
num_duplicates = dup_mask.sum()

print(f"Number of duplicate rows (ignoring specified columns): {num_duplicates}")




Number of duplicate rows (ignoring specified columns): 1256


In [8]:
# Remove rows that repeat an earlier row on every column in `cols_check`
# (the first row of each duplicate group is kept), then renumber the index.
df = df.drop_duplicates(subset=cols_check, keep="first").reset_index(drop=True)

In [9]:
# Number of distinct package identifiers; used as the denominator when
# percentages of packages are computed.
total_packages = df['MAILITM_FID'].nunique()


In [10]:
# Event type names whose per-package coverage is measured next.
candidate_events = [
    "Réception d'envoi du client (Srt)",
    "Recevoir envoi au bureau d'échange (Ent)",
    "Recevoir envoi au bureau de livraison (Ent)",
    "Livraison d'envoi (Ent)"
]


In [11]:
# Coverage of each candidate event: how many distinct packages have the
# event at least once, and what share of all packages that represents.
#
# A single filtered groupby replaces one full boolean scan of `df` per
# event. The reindex keeps the rows in `candidate_events` order and reports
# 0 for an event that never occurs, matching the per-event filter approach.
packages_per_event = (
    df.loc[df['EVENT_TYPE_NM'].isin(candidate_events)]
    .groupby('EVENT_TYPE_NM')['MAILITM_FID']
    .nunique()
    .reindex(candidate_events, fill_value=0)
)

result = [
    {
        "event": event,
        "unique_packages": int(packages_with_event),
        "percentage": int(packages_with_event) / total_packages * 100,
    }
    for event, packages_with_event in packages_per_event.items()
]

# pandas is already imported in the first cell; no re-import needed here.
result_df = pd.DataFrame(result)


In [12]:
result_df

Unnamed: 0,event,unique_packages,percentage
0,Réception d'envoi du client (Srt),1443456,47.356237
1,Recevoir envoi au bureau d'échange (Ent),3035601,99.590595
2,Recevoir envoi au bureau de livraison (Ent),2980319,97.776928
3,Livraison d'envoi (Ent),2699706,88.570707


In [22]:
# Distinct packages per event type, plus that count as a percentage of all
# packages (`total_packages`).
event_counts = (
    df.groupby('EVENT_TYPE_NM')['MAILITM_FID']
    .nunique()
    .reset_index(name='unique_packages')
    .assign(percentage=lambda counts: counts['unique_packages'] / total_packages * 100)
)
print(event_counts)  # Print the DataFrame with event_counts


                                        EVENT_TYPE_NM  unique_packages  \
0   Enregistrer détails d'envoi au bureau d'échang...           286223   
1   Enregistrer informations douanières d'envoi (Ent)              888   
2   Enregistrer raison de rétention d'envoi par la...             1018   
3        Expédier envoi pour livraison physique (Ent)                1   
4            Expédier envoi à adresse nationale (Ent)          2967295   
5            Expédier envoi à adresse nationale (Srt)          2631488   
6                    Expédier envoi à la douane (Ent)            44991   
7                    Expédier envoi à la douane (Srt)              122   
8          Expédition d'envoi à l'étranger (EDI-reçu)          2531209   
9              Garder envoi au bureau d'échange (Ent)                1   
10           Garder envoi au point de livraison (Ent)                9   
11                       Insérer envoi dans sac (Srt)          2710436   
12                            Livraiso

In [17]:
print(df['EVENT_TYPE_NM'].unique())


["Réception d'envoi du client (Srt)"
 "Recevoir envoi au bureau d'échange (Ent)"
 'Expédier envoi à adresse nationale (Ent)'
 'Recevoir envoi au bureau de livraison (Ent)'
 "Vaine tentative de livraison d'envoi (Ent)" "Livraison d'envoi (Ent)"
 'Insérer envoi dans sac (Srt)' 'Expédier envoi à adresse nationale (Srt)'
 "Transmettre envoi à l'agent de livraison (Ent)"
 'Mettre à jour envoi (Ent)' 'Expédier envoi à la douane (Ent)'
 'Renvoyer envoi de la douane (Ent)'
 "Enregistrer détails d'envoi au bureau d'échange (Srt)"
 "Recevoir envoi au bureau d'échange (Srt)" 'Recevoir envoi au lieu (Ent)'
 "Expédition d'envoi à l'étranger (EDI-reçu)"
 'Renvoyer envoi de la douane (Srt)'
 "Enregistrer informations douanières d'envoi (Ent)"
 'Expédier envoi à la douane (Srt)'
 "Enregistrer raison de rétention d'envoi par la douane (Srt)"
 'Recevoir envoi au lieu (Srt)' 'Garder envoi au point de livraison (Ent)'
 'Supprimer envoi du sac (Srt)' 'Mettre à jour envoi (Srt)'
 'Expédier envoi pour livrai

In [18]:
print("Filtered unique packages:", df['MAILITM_FID'].nunique())


Filtered unique packages: 3048080


In [16]:
# The four events treated as "core" for this check
core_events = [
    "Recevoir envoi au bureau d'échange (Srt)",
    "Expédier envoi à adresse nationale (Srt)",
    "Expédier envoi à adresse nationale (Ent)",
    "Livraison d'envoi (Ent)"
]

# One row per package, one column per core event. Each cell holds the first
# non-null date recorded for that (package, event) pair and is NaN when the
# package never had the event.
core_rows = df[df['EVENT_TYPE_NM'].isin(core_events)]
pivot = core_rows.pivot_table(
    index='MAILITM_FID',
    columns='EVENT_TYPE_NM',
    values='date',
    aggfunc='first'
)

# A package has all four events when none of its four cells is NaN
has_all_events = pivot.dropna(subset=core_events)
num_packages_all_events = len(has_all_events)

# Denominator: every distinct package in the data
total_packages = df['MAILITM_FID'].nunique()

percent_all_events = num_packages_all_events / total_packages * 100

print(f"Number of packages with all 4 core events: {num_packages_all_events}")
print(f"Percentage: {percent_all_events:.2f}%")


Number of packages with all 4 core events: 3091
Percentage: 0.10%


In [19]:
# Distinct packages that have each core event at least once
for event in core_events:
    has_event = df['EVENT_TYPE_NM'] == event
    count = df.loc[has_event, 'MAILITM_FID'].nunique()
    print(f"{event}: {count}")


Recevoir envoi au bureau d'échange (Srt): 97274
Expédier envoi à adresse nationale (Srt): 2631488
Expédier envoi à adresse nationale (Ent): 2967295
Livraison d'envoi (Ent): 2699706


In [20]:
# Set of package ids for each of the four core events, in list order
ids1, ids2, ids3, ids4 = (
    set(df.loc[df['EVENT_TYPE_NM'] == core_events[position], 'MAILITM_FID'])
    for position in range(4)
)

# Packages having every one of the four events
intersection = ids1 & ids2 & ids3 & ids4

# Packages having at least one event, and at least two (union of all pairwise overlaps)
any_event = ids1 | ids2 | ids3 | ids4
pairwise_overlap = (
    (ids1 & ids2) | (ids1 & ids3) | (ids1 & ids4)
    | (ids2 & ids3) | (ids2 & ids4)
    | (ids3 & ids4)
)

print(f"At least 1: {len(any_event)}")
print(f"At least 2: {len(pairwise_overlap)}")
print(f"All 4: {len(intersection)}")


At least 1: 3034436
At least 2: 2984773
All 4: 3091


In [21]:
# Packages that have all three of the highest-coverage events.
ids1 = set(df[df['EVENT_TYPE_NM'] == "Expédier envoi à adresse nationale (Srt)"]['MAILITM_FID'])
ids2 = set(df[df['EVENT_TYPE_NM'] == "Expédier envoi à adresse nationale (Ent)"]['MAILITM_FID'])
ids3 = set(df[df['EVENT_TYPE_NM'] == "Livraison d'envoi (Ent)"]['MAILITM_FID'])
intersection_3 = ids1 & ids2 & ids3
print(f"All 3 big events: {len(intersection_3)}")
# Use the computed package count rather than a hard-coded literal so the
# percentage stays correct if the input file or the cleaning steps change.
print(f"Percentage: {len(intersection_3) / total_packages * 100:.2f}%")


All 3 big events: 2289362
Percentage: 75.11%


In [23]:
# Redefine the core events: this replaces the earlier list with four "(Ent)" events.
core_events = [
    "Recevoir envoi au bureau d'échange (Ent)",
    "Expédier envoi à adresse nationale (Ent)",
    "Recevoir envoi au bureau de livraison (Ent)",
    "Livraison d'envoi (Ent)"
]


In [24]:
# Package-id set for each of the four core events, in list order.
ids0 = set(df[df['EVENT_TYPE_NM'] == core_events[0]]['MAILITM_FID'])
ids1 = set(df[df['EVENT_TYPE_NM'] == core_events[1]]['MAILITM_FID'])
ids2 = set(df[df['EVENT_TYPE_NM'] == core_events[2]]['MAILITM_FID'])
ids3 = set(df[df['EVENT_TYPE_NM'] == core_events[3]]['MAILITM_FID'])

# Packages that have every core event at least once.
intersection_4 = ids0 & ids1 & ids2 & ids3
print(f"Packages with all 4 events: {len(intersection_4)}")
# Use the computed package count rather than a hard-coded literal so the
# percentage stays correct if the input file or the cleaning steps change.
print(f"Percentage: {len(intersection_4) / total_packages * 100:.2f}%")


Packages with all 4 events: 2610302
Percentage: 85.64%


In [28]:
# Row count for every (package, event type) pair
dupes = (
    df.groupby(['MAILITM_FID', 'EVENT_TYPE_NM'])
    .size()
    .rename('count')
    .reset_index()
)

# Pairs that occur more than once
duplicates = dupes.loc[dupes['count'] > 1]
print(duplicates)


            MAILITM_FID                                EVENT_TYPE_NM  count
0         CA000020800LY     Expédier envoi à adresse nationale (Ent)      2
1         CA000020800LY     Expédier envoi à adresse nationale (Srt)      2
3         CA000020800LY                 Insérer envoi dans sac (Srt)      2
6         CA000020800LY  Recevoir envoi au bureau de livraison (Ent)      2
18        CA000094303US     Expédier envoi à adresse nationale (Srt)      3
...                 ...                                          ...    ...
25041379  ua674489962ae   Expédition d'envoi à l'étranger (EDI-reçu)      4
25041380  ua674674235ae   Expédition d'envoi à l'étranger (EDI-reçu)      4
25041381  ua674848064ae   Expédition d'envoi à l'étranger (EDI-reçu)      4
25041382  ua675528778ae   Expédition d'envoi à l'étranger (EDI-reçu)      4
25041383  ua675652713ae   Expédition d'envoi à l'étranger (EDI-reçu)      4

[6725394 rows x 3 columns]


In [29]:
# Repeated (package, event) pairs, restricted to the core events
is_core_event = dupes['EVENT_TYPE_NM'].isin(core_events)
is_repeated = dupes['count'] > 1
dupes_core = dupes[is_core_event & is_repeated]

# Report how many such pairs exist and preview the first few
print(f"Number of (package, event) pairs with duplicates (core events only): {len(dupes_core)}")
print(dupes_core.head())

Number of (package, event) pairs with duplicates (core events only): 4048934
      MAILITM_FID                                EVENT_TYPE_NM  count
0   CA000020800LY     Expédier envoi à adresse nationale (Ent)      2
6   CA000020800LY  Recevoir envoi au bureau de livraison (Ent)      2
23  CA000094303US  Recevoir envoi au bureau de livraison (Ent)      3
27  CA000102985US     Expédier envoi à adresse nationale (Ent)      3
42  CA000132868US  Recevoir envoi au bureau de livraison (Ent)      4


In [31]:
# Make sure 'date' is datetime (already the case when the CSV was read with
# parse_dates, in which case this leaves the values unchanged)
df['date'] = pd.to_datetime(df['date'])

# Keep only the earliest occurrence of each core event per package.
# A stable sort (mergesort) is requested explicitly: the default quicksort
# does not preserve the relative order of rows that share a timestamp, so
# which of several same-timestamp rows survives `keep='first'` could vary.
# With a stable sort, ties are broken by the rows' original order in `df`.
df_core_clean = (
    df[df['EVENT_TYPE_NM'].isin(core_events)]
    .sort_values('date', kind='mergesort')
    .drop_duplicates(subset=['MAILITM_FID', 'EVENT_TYPE_NM'], keep='first')
)
print(f"Rows after cleaning: {df_core_clean.shape[0]}")
df_core_clean.head()



Rows after cleaning: 11682921


Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
4632432,RK853511820FR,Expédier envoi à adresse nationale (Ent),2005-11-08 23:53:52,ORAN EL M-NAOUER,35,SIDI BELABES RP,FR,7101 days 13:59:08,7120 days 08:10:10
4152891,RK526739733FR,Expédier envoi à adresse nationale (Ent),2009-08-25 09:31:51,ALGER GARE,35,CONSTANTINE GARE,FR,1 days 05:14:34,5166 days 22:34:03
4153391,RK526812685FR,Expédier envoi à adresse nationale (Ent),2009-11-10 10:00:42,ALGER GARE,35,CONSTANTINE GARE,FR,1 days 11:05:46,4834 days 00:16:34
4162117,RK651819793FR,Expédier envoi à adresse nationale (Ent),2009-12-13 18:30:02,ORAN EL M-NAOUER,35,SAIDA RP,FR,4597 days 11:09:58,4612 days 18:14:57
77919,RB168186601SG,Recevoir envoi au bureau d'échange (Ent),2017-03-13 08:31:10,ALGER COLIS POSTAUX,30,,SG,0 days 23:16:12,1835 days 05:53:50


In [33]:
# Keys identifying a repeated (package, event) pair
pair_keys = ['MAILITM_FID', 'EVENT_TYPE_NM']
dupe_packages = dupes_core[pair_keys]

# All core-event rows from the full frame
core_df = df.loc[df['EVENT_TYPE_NM'].isin(core_events)]

# Inner join keeps only the rows belonging to a repeated pair; sorting by
# package, event and date puts each pair's repeats next to each other.
dupe_rows = (
    core_df.merge(dupe_packages, on=pair_keys)
    .sort_values(pair_keys + ['date'])
)

# Preview the first 50 rows
dupe_rows.head(50)


Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
8466100,CA000020800LY,Expédier envoi à adresse nationale (Ent),2022-02-22 09:52:03,ALGER COLIS POSTAUX,35,FER CPX ALGER,LY,0 days 00:42:49,24 days 23:11:23
8466101,CA000020800LY,Expédier envoi à adresse nationale (Ent),2022-02-22 10:34:52,ALGER COLIS POSTAUX,35,CONSTANTINE COLIS POSTAUX,LY,1 days 23:56:45,24 days 23:11:23
8466102,CA000020800LY,Recevoir envoi au bureau de livraison (Ent),2022-02-26 10:21:19,CDD OUM EL BOUAGHI,32,,LY,0 days 00:02:44,24 days 23:11:23
8466103,CA000020800LY,Recevoir envoi au bureau de livraison (Ent),2022-02-27 09:40:15,CDD AIN MILILA,32,,LY,0 days 00:00:35,24 days 23:11:23
8466104,CA000094303US,Recevoir envoi au bureau de livraison (Ent),2024-12-02 08:41:38,ANNABA EL MARSA,32,,US,0 days 00:52:43,26 days 21:55:12
8466105,CA000094303US,Recevoir envoi au bureau de livraison (Ent),2024-12-03 14:24:54,CDD SKIKDA,32,,US,0 days 00:08:31,26 days 21:55:12
8466106,CA000094303US,Recevoir envoi au bureau de livraison (Ent),2024-12-04 08:47:03,SKIKDA RP,32,,US,0 days 00:00:31,26 days 21:55:12
8466107,CA000102985US,Expédier envoi à adresse nationale (Ent),2024-12-10 08:00:35,ALGER COLIS POSTAUX,35,ALGER GARE,US,1 days 01:47:31,46 days 17:38:21
8466108,CA000102985US,Expédier envoi à adresse nationale (Ent),2024-12-11 09:48:06,ALGER GARE,35,CDD KOUBA,US,0 days 21:09:32,46 days 17:38:21
8466109,CA000102985US,Expédier envoi à adresse nationale (Ent),2024-12-12 06:59:15,CDD KOUBA,35,HAI EL BADR,US,2 days 02:43:07,46 days 17:38:21


In [34]:
import pandas as pd
import numpy as np

# Ensure datetime
df['date'] = pd.to_datetime(df['date'])

# The four core events analysed here
core_events = [
    "Recevoir envoi au bureau d'échange (Ent)",
    "Expédier envoi à adresse nationale (Ent)",
    "Recevoir envoi au bureau de livraison (Ent)",
    "Livraison d'envoi (Ent)"
]

thresholds = [30, 60, 300, 1440]  # in minutes (30min, 1hr, 5hr, 1day)
results = []

# Filter to core events only
df_core = df[df['EVENT_TYPE_NM'].isin(core_events)]

# Step 1: (package, event) pairs that occur more than once, regardless of
# location or time
dupes_all = (
    df_core.groupby(['MAILITM_FID', 'EVENT_TYPE_NM'])
    .size()
    .reset_index(name='count')
)
step1_count = (dupes_all['count'] > 1).sum()

# Step 2: (package, event, location) triples that occur more than once
location_keys = ['MAILITM_FID', 'EVENT_TYPE_NM', 'établissement_postal']
dupes_loc = (
    df_core.groupby(location_keys)
    .size()
    .reset_index(name='count')
)
step2_count = (dupes_loc['count'] > 1).sum()

# Steps 3/4 preparation. None of this depends on the threshold, so it is
# done once here instead of once per loop iteration (the sort and grouped
# diff over the full core frame are the expensive part).
dupe_rows = df_core.sort_values(location_keys + ['date'])
dupe_rows = dupe_rows.assign(
    # Minutes since the previous row of the same (package, event, location);
    # NaN for the first row of each group.
    time_diff_min=dupe_rows.groupby(location_keys)['date'].diff().dt.total_seconds() / 60
)
# Only rows that actually have a predecessor in their group can be "close repeats"
repeat_gaps = dupe_rows[dupe_rows['time_diff_min'].notnull()]

for thresh in thresholds:
    # Groups containing at least one consecutive pair closer than `thresh` minutes
    timegap_flags = (
        repeat_gaps[repeat_gaps['time_diff_min'] < thresh]
        .groupby(location_keys)
        .size()
        .reset_index()
    )

    results.append({
        'threshold_min': thresh,
        'step1_as_is': step1_count,
        'step2_location': step2_count,
        # Step 3 is computed within (package, event, location) groups, so it
        # already combines the location and time-gap criteria ...
        'step3_timegap': timegap_flags.shape[0],
        # ... which is why step 4 ("both filters") is the same number.
        'step4_both': timegap_flags.shape[0],
    })

# Display as DataFrame
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,threshold_min,step1_as_is,step2_location,step3_timegap,step4_both
0,30,4048934,1080171,276548,276548
1,60,4048934,1080171,418125,418125
2,300,4048934,1080171,508561,508561
3,1440,4048934,1080171,613722,613722


In [35]:
print(results_df)

   threshold_min  step1_as_is  step2_location  step3_timegap  step4_both
0             30      4048934         1080171         276548      276548
1             60      4048934         1080171         418125      418125
2            300      4048934         1080171         508561      508561
3           1440      4048934         1080171         613722      613722


In [36]:
# 1. Independent copy of the core-event rows
df_core = df.loc[df['EVENT_TYPE_NM'].isin(core_events)].copy()

# 2. Number of distinct locations seen for each (package, event) pair.
#    Note: this table is computed here but is not used by the filter below.
pair_keys = ['MAILITM_FID', 'EVENT_TYPE_NM']
loc_counts = (
    df_core.groupby(pair_keys)['établissement_postal']
    .nunique()
    .reset_index(name='location_count')
)

# 3. Row count per (package, event, location); keep the triples seen more
#    than once, i.e. the same event repeated at the same location.
repeat_sizes = (
    df_core.groupby(pair_keys + ['établissement_postal'])
    .size()
    .reset_index(name='repeat_count')
)
dupes_same_loc = repeat_sizes[repeat_sizes['repeat_count'] > 1]

# Size of the analysis base: number of repeated (package, event, location) triples
print(f"Analysis base size: {len(dupes_same_loc)}")


Analysis base size: 1080171


In [37]:
# How many (package, event, location) triples have each repeat count,
# ordered by repeat count ascending.
repeat_dist = dupes_same_loc['repeat_count'].value_counts().sort_index()
print("Distribution of repeat counts per (package, event, location):")
print(repeat_dist)


Distribution of repeat counts per (package, event, location):
repeat_count
2      980129
3       75217
4       15129
5        4783
6        1966
7        1055
8         555
9         373
10        233
11        155
12        111
13         80
14         60
15         59
16         42
17         36
18         23
19         14
20         18
21         13
22         18
23         12
24          9
25          6
26          5
27          9
28          5
29          5
30          3
31          5
33          4
34          2
35          2
37          3
38          4
39          4
40          5
41          6
42          1
46          2
48          3
56          1
58          1
64          1
66          1
92          2
100         1
Name: count, dtype: int64


In [38]:
# Number of repeated (package, event, location) triples per core event type,
# largest first. This counts triples, not the individual repeated rows.
event_repeat = dupes_same_loc.groupby('EVENT_TYPE_NM').size().sort_values(ascending=False)
print("\nRepeat counts by core event type:")
print(event_repeat)



Repeat counts by core event type:
EVENT_TYPE_NM
Expédier envoi à adresse nationale (Ent)       561832
Recevoir envoi au bureau de livraison (Ent)    392156
Recevoir envoi au bureau d'échange (Ent)       100460
Livraison d'envoi (Ent)                         25723
dtype: int64


In [39]:
# The 20 locations with the most repeated (package, event, location) triples.
location_hotspots = dupes_same_loc.groupby('établissement_postal').size().sort_values(ascending=False).head(20)
print("\nTop 20 hotspot locations for core event repeats:")
print(location_hotspots)



Top 20 hotspot locations for core event repeats:
établissement_postal
ALGER COLIS POSTAUX           369377
ALGER GARE                    274909
CTR CHLEF                      23779
ANNABA EL MARSA                17406
CONSTANTINE COLIS POSTAUX      15109
CDD SETIF                      13106
CDD BATNA                      10112
ORAN EL M-NAOUER                8297
CDD JIJEL                       8072
CONSTANTINE GARE                7867
CDD TIZI-OUZOU                  7540
CDD BLIDA                       7467
CDD SKIKDA                      6257
CDD BEJAIA                      5761
CDD BOUMERDES FRANTZ FANON      5426
CDD DJELFA                      5415
CDD TLEMCEN                     5065
CDD MOSTAGHANEM                 4361
CTR BISKRA                      4331
CDD BBA                         4289
dtype: int64


In [40]:
# Pull the actual rows (with dates) for every repeated
# (package, event, location) triple, ordered so that consecutive rows of a
# triple are adjacent and chronological.
merge_cols = ['MAILITM_FID', 'EVENT_TYPE_NM', 'établissement_postal']
dupe_rows = (
    df_core.merge(dupes_same_loc[merge_cols], on=merge_cols)
    .sort_values(merge_cols + ['date'])
)

# Gap to the previous repeat within the same triple, converted to minutes
gap_seconds = dupe_rows.groupby(merge_cols)['date'].diff().dt.total_seconds()
dupe_rows['time_diff_min'] = gap_seconds / 60

# The first row of each triple has no predecessor (NaN) and is left out
observed_gaps = dupe_rows['time_diff_min'].dropna()
gap_stats = observed_gaps.describe(percentiles=[.25, .5, .75, .9, .95, .99])
print("\nTime gap stats between repeats (in minutes):")
print(gap_stats)



Time gap stats between repeats (in minutes):
count    1.231758e+06
mean     9.945929e+03
std      3.702896e+04
min      0.000000e+00
25%      2.923333e+01
50%      1.261242e+03
75%      5.745250e+03
90%      2.875127e+04
95%      4.784984e+04
99%      1.308874e+05
max      7.428861e+06
Name: time_diff_min, dtype: float64


In [42]:
# The five (package, event, location) triples with the highest repeat count
top_repeats = dupes_same_loc.sort_values('repeat_count', ascending=False).head(5)
shown_columns = ['date', 'EVENT_TYPE_NM', 'établissement_postal']

print("\nTop 5 packages with most repeats (show full event sequence):")
for package_id, event_name, office in zip(
    top_repeats['MAILITM_FID'],
    top_repeats['EVENT_TYPE_NM'],
    top_repeats['établissement_postal'],
):
    # Rows of df_core belonging to exactly this triple
    mask = (
        (df_core['MAILITM_FID'] == package_id)
        & (df_core['EVENT_TYPE_NM'] == event_name)
        & (df_core['établissement_postal'] == office)
    )
    print(f"\nPackage: {package_id} | Event: {event_name} | Location: {office}")
    print(df_core.loc[mask].sort_values('date')[shown_columns])



Top 5 packages with most repeats (show full event sequence):

Package: RB184660484SG | Event: Expédier envoi à adresse nationale (Ent) | Location: CDD KOUBA
                       date                             EVENT_TYPE_NM  \
1058835 2022-04-18 07:31:09  Expédier envoi à adresse nationale (Ent)   
1058836 2022-04-18 07:32:15  Expédier envoi à adresse nationale (Ent)   
1058837 2022-04-18 07:32:28  Expédier envoi à adresse nationale (Ent)   
1058838 2022-04-18 07:33:06  Expédier envoi à adresse nationale (Ent)   
1058839 2022-04-18 07:33:36  Expédier envoi à adresse nationale (Ent)   
...                     ...                                       ...   
1058930 2022-04-18 10:12:27  Expédier envoi à adresse nationale (Ent)   
1058931 2022-04-18 10:33:51  Expédier envoi à adresse nationale (Ent)   
1058932 2022-04-18 10:50:02  Expédier envoi à adresse nationale (Ent)   
1058933 2022-04-18 10:50:23  Expédier envoi à adresse nationale (Ent)   
1058934 2022-04-19 06:49:42  Expédier e

In [43]:
import pandas as pd

# Core events and datetime dtype used by the rest of this cell
core_events = [
    "Recevoir envoi au bureau d'échange (Ent)",
    "Expédier envoi à adresse nationale (Ent)",
    "Recevoir envoi au bureau de livraison (Ent)",
    "Livraison d'envoi (Ent)"
]
df['date'] = pd.to_datetime(df['date'])

# 1. Independent copy of the core-event rows
df_core = df.loc[df['EVENT_TYPE_NM'].isin(core_events)].copy()

# 2. Packages that have all 4 core events: count the distinct core event
#    types per package and keep the packages where that count is 4.
package_event_counts = df_core.groupby('MAILITM_FID')['EVENT_TYPE_NM'].nunique()
good_packages = package_event_counts.index[package_event_counts == 4]
df_good = df_core[df_core['MAILITM_FID'].isin(good_packages)]
# 3. For each package, get first/last per event as specified
def get_occurrence(group):
    """Pick one milestone timestamp per core event from one package's rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with at least the columns 'EVENT_TYPE_NM' and 'date'.

    Returns
    -------
    pandas.Series
        Indexed by 'exchange_entry', 'exchange_dispatch',
        'delivery_office_entry' and 'delivered'. Every entry is the earliest
        date of its event except 'exchange_dispatch', which is the latest.
        An event with no rows in `group` yields a missing value.
    """
    event_names = group['EVENT_TYPE_NM']
    event_dates = group['date']

    def _dates_of(event_name):
        # Dates of the rows whose event type equals `event_name`
        return event_dates[event_names == event_name]

    return pd.Series({
        # First entry to exchange
        'exchange_entry': _dates_of("Recevoir envoi au bureau d'échange (Ent)").min(),
        # Last dispatch
        'exchange_dispatch': _dates_of("Expédier envoi à adresse nationale (Ent)").max(),
        # First entry to delivery office
        'delivery_office_entry': _dates_of("Recevoir envoi au bureau de livraison (Ent)").min(),
        # First delivery
        'delivered': _dates_of("Livraison d'envoi (Ent)").min(),
    })

# Build the per-package milestone table with vectorised grouped min/max
# instead of `groupby('MAILITM_FID').apply(get_occurrence)`: calling a Python
# function once per package is very slow on millions of packages and, on
# recent pandas, warns because the grouping column is passed into the
# applied function. The selection rules are the same as in
# `get_occurrence`: earliest date for every event except the dispatch
# event, for which the latest date is used.
_milestone_events = [
    "Recevoir envoi au bureau d'échange (Ent)",
    "Expédier envoi à adresse nationale (Ent)",
    "Recevoir envoi au bureau de livraison (Ent)",
    "Livraison d'envoi (Ent)",
]
_dates_by_event = df_good.groupby(['MAILITM_FID', 'EVENT_TYPE_NM'])['date']
# One row per package, one column per event; reindex guarantees all four
# columns exist even if an event is absent from `df_good`.
_first_seen = _dates_by_event.min().unstack('EVENT_TYPE_NM').reindex(columns=_milestone_events)
_last_seen = _dates_by_event.max().unstack('EVENT_TYPE_NM').reindex(columns=_milestone_events)

summary = (
    pd.DataFrame({
        'exchange_entry': _first_seen[_milestone_events[0]],
        'exchange_dispatch': _last_seen[_milestone_events[1]],
        'delivery_office_entry': _first_seen[_milestone_events[2]],
        'delivered': _first_seen[_milestone_events[3]],
    })
    .rename_axis('MAILITM_FID')
    .reset_index()
)

# 4. Stage durations in days: for each output column, subtract the start
#    milestone from the end milestone and convert seconds to days.
seconds_per_day = 86400
stage_bounds = {
    'exchange_to_dispatch_duration': ('exchange_entry', 'exchange_dispatch'),
    'dispatch_to_delivery_office_duration': ('exchange_dispatch', 'delivery_office_entry'),
    'delivery_office_to_delivered_duration': ('delivery_office_entry', 'delivered'),
    'total_calculated_duration': ('exchange_entry', 'delivered'),
}
for duration_col, (start_col, end_col) in stage_bounds.items():
    elapsed = summary[end_col] - summary[start_col]
    summary[duration_col] = elapsed.dt.total_seconds() / seconds_per_day

# 5. (Optional) Bring in the dataset's own total_duration for comparison,
#    taken from the first df_good row of each package.
if 'total_duration' in df_good.columns:
    first_row_per_package = df_good.drop_duplicates('MAILITM_FID')
    total_duration_map = first_row_per_package.set_index('MAILITM_FID')['total_duration']
    summary['total_duration'] = summary['MAILITM_FID'].map(total_duration_map)

print(summary.head())


  summary = df_good.groupby('MAILITM_FID').apply(get_occurrence).reset_index()


     MAILITM_FID      exchange_entry   exchange_dispatch  \
0  CA000020800LY 2022-02-22 09:50:25 2022-02-22 10:34:52   
1  CA000086085US 2024-11-30 08:24:22 2024-11-30 11:13:22   
2  CA000094303US 2024-11-30 08:24:41 2024-11-30 10:39:45   
3  CA000132868US 2024-12-11 08:52:13 2024-12-11 10:05:37   
4  CA000138344US 2024-12-11 08:55:21 2024-12-11 10:05:25   

  delivery_office_entry           delivered  exchange_to_dispatch_duration  \
0   2022-02-26 10:21:19 2022-02-28 09:30:23                       0.030868   
1   2024-12-02 10:02:01 2024-12-11 08:08:36                       0.117361   
2   2024-12-02 08:41:38 2024-12-15 08:55:12                       0.093796   
3   2024-12-14 08:05:04 2024-12-21 12:28:58                       0.050972   
4   2024-12-14 08:04:53 2024-12-21 12:28:18                       0.048657   

   dispatch_to_delivery_office_duration  \
0                              3.990590   
1                              1.950451   
2                              1.917975  