In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import pandas as pd
from pathlib import Path
import gc            # optional: force garbage-collection between files

# ── SETTINGS ─────────────────────────────────────────────────────
# Chunk files to scan; rows are appended to the output in this order.
FILES = [
    "df_count_1.csv",
    "df_count_2_part1.csv",
    "df_count_2_part2.csv",
    "df_count_3_part1.csv",
    "df_count_3_part2.csv",
    "df_count_4.csv",
]
OUT_FILE = Path("combined_ge15days_official.csv")  # merged result is written here
DUR_COL = "total_duration"                          # column the filter is applied to
THRESHOLD = pd.Timedelta(days=15)                   # minimum duration a row must reach to be kept
# ─────────────────────────────────────────────────────────────────

# Remove any previous result so the appends below start from a missing file.
OUT_FILE.unlink(missing_ok=True)

for src in FILES:
    print(f"→ reading {src}")

    # Load the whole file at once.
    chunk = pd.read_csv(src)

    # Parse the duration column; values that cannot be parsed become NaT,
    # and NaT never satisfies the >= comparison, so those rows are dropped too.
    chunk[DUR_COL] = pd.to_timedelta(chunk[DUR_COL], errors="coerce")
    kept = chunk.loc[chunk[DUR_COL] >= THRESHOLD]

    # Append to the output; the header is emitted only while the file
    # does not exist yet (i.e. on the first write after the unlink above).
    kept.to_csv(OUT_FILE, mode="a", header=not OUT_FILE.exists(), index=False)

    # Release both frames before the next file is loaded.
    del chunk, kept
    gc.collect()

print("✓ All files processed")
print(f"Result saved to: {OUT_FILE.resolve()}")


→ reading df_count_1.csv
→ reading df_count_2_part1.csv
→ reading df_count_2_part2.csv
→ reading df_count_3_part1.csv
→ reading df_count_3_part2.csv
→ reading df_count_4.csv
✓ All files processed
Result saved to: C:\Users\oussa\Desktop\alg_poste_stage\combined_ge15days_official.csv


In [3]:
# Load the combined extract written by the filtering cell; the "date" column is
# parsed to datetimes at read time. The duration columns are not converted here —
# later cells pass them through pd.to_timedelta before comparing them.
df = pd.read_csv("combined_ge15days_official.csv", parse_dates=["date"])

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
0,CA000422162US,Réception d'envoi du client (Srt),2025-02-03 15:10:00,ÉTATS-UNIS,1,,US,12 days 17:45:47,28 days 18:51:38
1,CA000422162US,Recevoir envoi au bureau d'échange (Ent),2025-02-16 08:55:47,ALGER COLIS POSTAUX,30,,US,0 days 01:29:54,28 days 18:51:38
2,CA000422162US,Expédier envoi à adresse nationale (Ent),2025-02-16 10:25:41,ALGER COLIS POSTAUX,35,ANNABA EL MARSA,US,0 days 00:00:47,28 days 18:51:38
3,CA000422162US,Expédier envoi à adresse nationale (Ent),2025-02-16 10:26:28,ALGER COLIS POSTAUX,35,AVION CPX ALGER,US,1 days 00:24:37,28 days 18:51:38
4,CA000422162US,Expédier envoi à adresse nationale (Ent),2025-02-17 10:51:05,ALGER COLIS POSTAUX,35,CTR BECHAR,US,10 days 03:03:13,28 days 18:51:38


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(34611297, 9)

In [5]:
# Number of distinct values in every column, listed from lowest to highest cardinality.
distinct_values_count = df.nunique().sort_values(ascending=True)
print(distinct_values_count)

EVENT_TYPE_NM                      26
EVENT_TYPE_CD                      26
id                                143
next_établissement_postal        3796
établissement_postal             3970
total_duration                1778999
duration_to_next_step         1928513
MAILITM_FID                   3048080
date                         15695109
dtype: int64


In [6]:

# Count how many rows carry each event type (EVENT_TYPE_NM), most frequent first
event_type = df['EVENT_TYPE_NM'].value_counts()

print(event_type)


EVENT_TYPE_NM
Recevoir envoi au bureau de livraison (Ent)                    6703810
Expédier envoi à adresse nationale (Ent)                       4977810
Expédier envoi à adresse nationale (Srt)                       4574729
Recevoir envoi au bureau d'échange (Ent)                       3171717
Insérer envoi dans sac (Srt)                                   2885480
Vaine tentative de livraison d'envoi (Ent)                     2879557
Livraison d'envoi (Ent)                                        2728635
Expédition d'envoi à l'étranger (EDI-reçu)                     2723907
Transmettre envoi à l'agent de livraison (Ent)                 1752347
Réception d'envoi du client (Srt)                              1741234
Enregistrer détails d'envoi au bureau d'échange (Srt)           290829
Recevoir envoi au bureau d'échange (Srt)                        100261
Expédier envoi à la douane (Ent)                                 45737
Renvoyer envoi de la douane (Ent)                              

In [7]:
# Earliest and latest event timestamps present in the frame.
event_dates = df["date"]
print("Min date:", event_dates.min())
print("Max date:", event_dates.max())

Min date: 2005-11-08 23:29:54
Max date: 2025-05-21 16:13:21


In [8]:
# One total_duration per parcel: take the first non-null value inside each
# MAILITM_FID group, then parse it to Timedelta (unparseable values become NaT).
durations = pd.to_timedelta(
    df.groupby("MAILITM_FID")["total_duration"].first(),
    errors="coerce",
)
durations.shape

(3048080,)

In [9]:
# Parcels whose total duration is at most 30 days. No lower bound is applied here;
# the "15" in the label relies on the file having been filtered to >= 15 days upstream.
upto_30 = durations.le(pd.Timedelta(days=30)).sum()
print("packages btw 15 and 30 = ", upto_30)

packages btw 15 and 30 =  1928382


In [10]:
# Parcels with 30 days < total duration <= 45 days, as a difference of two cumulative counts.
upto_30 = durations.le(pd.Timedelta(days=30)).sum()
upto_45 = durations.le(pd.Timedelta(days=45)).sum()
print("packages btw 30 and 45 = ", upto_45 - upto_30)

packages btw 30 and 45 =  588245


In [11]:
# Parcels with 45 days < total duration <= 60 days, as a difference of two cumulative counts.
upto_45 = durations.le(pd.Timedelta(days=45)).sum()
upto_60 = durations.le(pd.Timedelta(days=60)).sum()
print("packages btw 45 and 60 = ", upto_60 - upto_45)

packages btw 45 and 60 =  237984


In [12]:
# Parcels with 60 days < total duration <= 2000 days.
# NOTE: anything longer than 2000 days (and any NaT) is not counted in this bucket.
upto_60 = durations.le(pd.Timedelta(days=60)).sum()
upto_2000 = durations.le(pd.Timedelta(days=2000)).sum()
print("packages btw 60 and 2000 = ", upto_2000 - upto_60)

packages btw 60 and 2000 =  293464


In [None]:
# Parcels whose total duration is 100 days or more (the comparison is inclusive).
at_least_100 = durations.ge(pd.Timedelta(days=100)).sum()
print("packages bigger 100 = ", at_least_100)

packages biggar 100 =  77694


In [4]:
# Columns left out of the duplicate comparison; every other column must match.
cols_ignore = ["RECPTCL_FID", "duration_to_next_step", "total_duration", "id"]
cols_check = list(filter(lambda col: col not in cols_ignore, df.columns))

# Flag every row that has at least one twin on cols_check. With keep=False all
# members of a duplicate group are flagged, not only the surplus copies.
dup_mask = df.duplicated(subset=cols_check, keep=False)

# Total number of flagged rows.
num_duplicates = dup_mask.sum()
print(f"Number of duplicate rows (ignoring specified columns): {num_duplicates}")




Number of duplicate rows (ignoring specified columns): 1256


In [5]:
# Keep the first row of every group that matches on cols_check, discard the
# later copies, and renumber the index from 0.
df = df.loc[~df.duplicated(subset=cols_check, keep="first")].reset_index(drop=True)

In [None]:
# Rows whose event name is the one treated here as the parcel entering Algeria.
enter_algeria_event = "Recevoir envoi au bureau d'échange (Ent)"
df_enter = df.loc[df["EVENT_TYPE_NM"].eq(enter_algeria_event)]

In [7]:
# Size of the "enter Algeria" subset versus the number of distinct parcels in it.
total_rows = df_enter.shape[0]
unique_fids_enter = df_enter["MAILITM_FID"].nunique()

In [8]:
# Report both counts; they are equal only if no parcel has more than one such row.
one_row_per_fid = "Yes" if total_rows == unique_fids_enter else "No"
print(f"Total 'enter Algeria' rows: {total_rows}")
print(f"Unique MAILITM_FID in subset: {unique_fids_enter}")
print(f"Are all rows unique per FID? {one_row_per_fid}")

Total 'enter Algeria' rows: 3171704
Unique MAILITM_FID in subset: 3035601
Are all rows unique per FID? No


In [9]:
# Per-row duration_to_next_step parsed to Timedelta (unparseable values become NaT).
durations_btw_events = pd.to_timedelta(df["duration_to_next_step"], errors="coerce")
durations_btw_events.shape

(34610669,)

In [10]:
# Rows whose duration_to_next_step is 15 days or less (inclusive; NaT is not counted).
within_15 = durations_btw_events.le(pd.Timedelta(days=15)).sum()
print("events smaller than 15 = ", within_15)

events smaller than 15 =  30270767


In [18]:
# Step 1: Count how many rows each MAILITM_FID has in the current df
id_counts = df["MAILITM_FID"].value_counts()

# Step 2: Count how many MAILITM_FIDs share each row count, ordered by row count
count_of_counts = id_counts.value_counts().sort_index()

print(count_of_counts)


count
1        171
2        604
3       3946
4      13336
5      34884
6     179651
7     552237
8     976507
9     851384
10    409150
11     23860
12      2187
13       160
14         3
Name: count, dtype: int64


In [12]:
# Inspect the parcel(s) with the highest number of rows in the current df.
# The target count is derived from id_counts instead of being hard-coded:
# a fixed literal only matches one particular state of df and silently
# yields an empty frame whenever the cells are run in a different order.
# NOTE(review): assumes the previous literal (252) was the maximum count — confirm.
max_rows_per_fid = id_counts.max()
ids_with_0 = id_counts[id_counts == max_rows_per_fid].index
df_with_0 = df[df["MAILITM_FID"].isin(ids_with_0)]
df_with_0.head()


Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
1597796,RB198581530SG,Insérer envoi dans sac (Srt),2022-06-01 04:01:00,,8,,SG,10 days 07:57:08,25 days 11:23:56
1597797,RB198581530SG,Recevoir envoi au bureau d'échange (Ent),2022-06-11 11:58:08,ALGER COLIS POSTAUX,30,,SG,0 days 00:06:59,25 days 11:23:56
1597798,RB198581530SG,Expédier envoi à adresse nationale (Ent),2022-06-11 12:05:07,ALGER COLIS POSTAUX,35,CDD SETIF,SG,0 days 00:33:50,25 days 11:23:56
1597799,RB198581530SG,Expédier envoi à adresse nationale (Ent),2022-06-11 12:38:57,ALGER COLIS POSTAUX,35,SECTION PAQUETS CPX ALGER,SG,1 days 19:29:28,25 days 11:23:56
1597800,RB198581530SG,Recevoir envoi au bureau de livraison (Ent),2022-06-13 08:08:25,CDD SETIF,32,,SG,0 days 04:39:42,25 days 11:23:56


In [13]:
# For each row, fetch the event name of the next row belonging to the same
# MAILITM_FID and flag the row when that next event is the same as its own.
dup_run = df["EVENT_TYPE_NM"].eq(
    df.groupby("MAILITM_FID")["EVENT_TYPE_NM"].shift(-1)
)

# Keep only the unflagged rows, i.e. the last row of every streak of
# identical consecutive events within a parcel.
df = df.loc[~dup_run].copy()

In [14]:
# (rows, columns) after removing consecutive repeated events.
df.shape

(31369774, 9)

In [15]:
# Step 1: Count how many rows each MAILITM_FID has in the current df
id_counts = df["MAILITM_FID"].value_counts()

# Step 2: Count how many MAILITM_FIDs share each row count, ordered by row count
count_of_counts = id_counts.value_counts().sort_index()

print(count_of_counts)


count
1        171
2        180
3       3673
4      10986
5      17879
       ...  
129        1
130        2
131        1
133        1
134        1
Name: count, Length: 109, dtype: int64


In [16]:
# Inspect the parcel(s) with the highest number of rows in the current df.
# The target count is derived from id_counts rather than hard-coded (it was 134,
# the largest count in the distribution printed just before), so the cell
# keeps returning rows when the data or the execution order changes.
max_rows_per_fid = id_counts.max()
ids_with_0 = id_counts[id_counts == max_rows_per_fid].index
df_with_0 = df[df["MAILITM_FID"].isin(ids_with_0)]
df_with_0.head()


Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
4767599,RQ762883509CN,Insérer envoi dans sac (Srt),2022-01-01 09:31:00,CHINE,8,,CN,0 days 18:34:00,346 days 03:21:03
4767600,RQ762883509CN,Expédition d'envoi à l'étranger (EDI-reçu),2022-01-02 04:05:00,,12,ALGÉRIE,CN,2 days 06:08:00,346 days 03:21:03
4767601,RQ762883509CN,Insérer envoi dans sac (Srt),2022-01-04 10:13:00,CHINE,8,,CN,0 days 17:52:00,346 days 03:21:03
4767602,RQ762883509CN,Expédition d'envoi à l'étranger (EDI-reçu),2022-01-05 04:05:00,,12,ALGÉRIE,CN,2 days 05:33:00,346 days 03:21:03
4767603,RQ762883509CN,Insérer envoi dans sac (Srt),2022-01-07 09:38:00,CHINE,8,,CN,1 days 18:27:00,346 days 03:21:03


In [17]:
# Within each MAILITM_FID, retain a single row per event name: the last one in
# row order. Earlier rows with the same (MAILITM_FID, EVENT_TYPE_NM) pair are dropped.
df = df.loc[~df.duplicated(subset=["MAILITM_FID", "EVENT_TYPE_NM"], keep="last")]



In [34]:
# (rows, columns) after keeping one row per (MAILITM_FID, EVENT_TYPE_NM).
df.shape

(25041384, 9)

In [35]:
# Step 1: Count how many rows each MAILITM_FID has in the current df
id_counts = df["MAILITM_FID"].value_counts()

# Step 2: Count how many MAILITM_FIDs share each row count, ordered by row count
count_of_counts = id_counts.value_counts().sort_index()

print(count_of_counts)


count
1        171
2        604
3       3946
4      13336
5      34884
6     179651
7     552237
8     976507
9     851384
10    409150
11     23860
12      2187
13       160
14         3
Name: count, dtype: int64


In [36]:
# Inspect the parcel(s) with the highest number of rows in the current df.
# The target count is derived from id_counts rather than hard-coded (it was 14,
# the largest count in the distribution printed just before), so the cell
# keeps returning rows when the data or the execution order changes.
max_rows_per_fid = id_counts.max()
ids_with_0 = id_counts[id_counts == max_rows_per_fid].index
df_with_0 = df[df["MAILITM_FID"].isin(ids_with_0)]
df_with_0.head()


Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
13304388,RB276200461SG,Insérer envoi dans sac (Srt),2023-05-07 19:06:00,,8,,SG,0 days 17:09:00,27 days 11:52:02
13304389,RB276200461SG,Expédition d'envoi à l'étranger (EDI-reçu),2023-05-08 12:15:00,,12,ALGÉRIE,SG,5 days 20:48:13,27 days 11:52:02
13304390,RB276200461SG,Recevoir envoi au bureau d'échange (Ent),2023-05-14 09:03:13,ALGER GARE,30,,SG,0 days 01:02:13,27 days 11:52:02
13304392,RB276200461SG,Expédier envoi à adresse nationale (Ent),2023-05-14 10:26:48,ALGER COLIS POSTAUX,35,CONSTANTINE COLIS POSTAUX,SG,8 days 02:46:04,27 days 11:52:02
13304394,RB276200461SG,Enregistrer détails d'envoi au bureau d'échang...,2023-05-22 13:14:17,CONSTANTINE COLIS POSTAUX,11,,SG,1 days 00:37:55,27 days 11:52:02
