In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import pandas as pd
from pathlib import Path
import gc            # optional: force garbage-collection between files

# ── CONFIGURE THESE ──────────────────────────────────────────────
# Zero-padded to two digits, so 1 → "01" and 10 → "10" (a literal "0" prefix
# would have produced "010" for two-digit file numbers).
TEMPLATE     = "df_with_durations_{:02d}.csv"  # e.g. df_with_durations_01.csv
FILE_NUMBERS = range(1, 8)                   # 1 … 7   (adjust if yours are 0 … 6)
OUT_FILE     = Path("combined_eq0days.csv")  # final result
DUR_COL      = "total_duration"              # column to filter
THRESHOLD    = pd.Timedelta(days=0)          # keep rows whose duration equals this
# ─────────────────────────────────────────────────────────────────

# Start fresh: the loop appends, so a leftover result from a previous run
# would otherwise be extended instead of replaced.
OUT_FILE.unlink(missing_ok=True)

# The header is written exactly once, by the first append.
write_header = True

for i in FILE_NUMBERS:
    src = TEMPLATE.format(i)
    print(f"→ reading {src}")

    # 1) Read the whole file. low_memory=False makes pandas infer each
    #    column's dtype from the full column instead of chunk by chunk, so a
    #    column cannot end up with mixed types (and no DtypeWarning is raised).
    df = pd.read_csv(src, low_memory=False)

    # 2) Convert the duration column to Timedelta; unparseable values become NaT.
    df[DUR_COL] = pd.to_timedelta(df[DUR_COL], errors="coerce")

    # 3) Keep only rows whose duration equals THRESHOLD (NaT never matches).
    df = df[df[DUR_COL] == THRESHOLD]

    # 4) Append to the output CSV.
    df.to_csv(
        OUT_FILE,
        mode="a",
        header=write_header,
        index=False
    )
    write_header = False

    # 5) Drop the reference to free memory before the next file is read.
    del df
    gc.collect()

print("✓ all files processed")
print(f"result saved to: {OUT_FILE.resolve()}")


→ reading df_with_durations_01.csv


  df = pd.read_csv(src)


→ reading df_with_durations_02.csv


  df = pd.read_csv(src)


→ reading df_with_durations_03.csv
→ reading df_with_durations_04.csv
→ reading df_with_durations_05.csv
→ reading df_with_durations_06.csv
→ reading df_with_durations_07.csv
✓ all files processed
result saved to: C:\Users\oussa\Desktop\alg_poste_stage\combined_eq0days.csv


In [4]:
# Load the combined zero-duration rows and parse "date" as datetime.
# low_memory=False makes pandas infer every column's dtype from the whole
# file rather than chunk by chunk, so no column ends up with mixed types.
df = pd.read_csv("combined_eq0days.csv", parse_dates=["date"], low_memory=False)

  df = pd.read_csv("combined_eq0days.csv", parse_dates=["date"])


In [5]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,RECPTCL_FID,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
0,,CA000020844RU,Réception d'envoi du client (Srt),2024-12-10 16:40:00.000,"RUSSIE, FÉDÉRATION DE",1,,RU,,0 days
1,,CA000306674TG,Réception d'envoi du client (Srt),2023-04-20 12:14:00.000,,1,,TG,,0 days
2,,CA000631038US,Insérer envoi dans sac (Srt),2025-03-12 21:28:00.000,ÉTATS-UNIS,8,,US,,0 days
3,,CA001003839US,Réception d'envoi du client (Srt),2025-04-18 15:34:00.000,ÉTATS-UNIS,1,,US,,0 days
4,,CA001056995US,Réception d'envoi du client (Srt),2025-04-25 12:39:00.000,ÉTATS-UNIS,1,,US,,0 days


In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(2734246, 10)

In [7]:
# Distinct values per column, least varied columns first.
distinct_values_count = (
    df.nunique()
      .sort_values(ascending=True)
)

print(distinct_values_count)

total_duration                     1
duration_to_next_step              2
EVENT_TYPE_NM                     20
EVENT_TYPE_CD                     20
id                               140
next_établissement_postal       2084
établissement_postal            2630
RECPTCL_FID                    89295
date                          255246
MAILITM_FID                  1901592
dtype: int64


In [8]:

# Number of rows per event type name, most frequent first.
event_type = df["EVENT_TYPE_NM"].value_counts()

print(event_type)


EVENT_TYPE_NM
Expédition d'envoi à l'étranger (EDI-reçu)                     1359483
Insérer envoi dans sac (Srt)                                   1158336
Expédier envoi à adresse nationale (Ent)                         43108
Recevoir envoi au bureau d'échange (Ent)                         39628
Recevoir envoi au bureau de livraison (Ent)                      38088
Expédier envoi à adresse nationale (Srt)                         23529
Livraison d'envoi (Ent)                                          22318
Vaine tentative de livraison d'envoi (Ent)                       15356
Réception d'envoi du client (Srt)                                13818
Expédier envoi à la douane (Ent)                                  9184
Transmettre envoi à l'agent de livraison (Ent)                    8669
Recevoir envoi au bureau d'échange (Srt)                          1092
Renvoyer envoi de la douane (Ent)                                  817
Enregistrer détails d'envoi au bureau d'échange (Srt)          

In [9]:
# Earliest and latest timestamps present in the "date" column.
print(f"Min date: {df['date'].min()!s}")
print(f"Max date: {df['date'].max()!s}")

Min date: 2021-04-05 11:08:35.447
Max date: 2025-05-21 14:31:28.140


In [9]:
# Columns left out of the duplicate comparison.
cols_ignore = ["RECPTCL_FID", "duration_to_next_step", "total_duration", "id"]
# Every other column, in frame order, is compared.
cols_check = [col for col in df.columns if col not in cols_ignore]

# keep=False flags every member of a duplicate group, not only the repeats,
# so the count below includes the first occurrence of each group as well.
dup_mask = df.duplicated(subset=cols_check, keep=False)

num_duplicates = dup_mask.sum()
print(f"Number of duplicate rows (ignoring specified columns): {num_duplicates}")




Number of duplicate rows (ignoring specified columns): 4087


In [11]:
# Keep the first row of each duplicate group (compared on cols_check only)
# and renumber the index from 0.
df = (
    df.drop_duplicates(subset=cols_check, keep="first")
      .reset_index(drop=True)
)

In [63]:
# Event name used to select the subset below.
# NOTE(review): the variable name suggests this event marks an item's arrival
# in Algeria — confirm with the data owner.
enter_algeria_event = "Recevoir envoi au bureau d'échange (Ent)"
df_enter = df.loc[df["EVENT_TYPE_NM"].eq(enter_algeria_event)]

In [64]:
# Row count of the subset versus the number of distinct items in it.
total_rows = df_enter.shape[0]
unique_fids_enter = df_enter["MAILITM_FID"].nunique()

In [65]:
# Report both counts; they are equal only when every MAILITM_FID appears on
# exactly one row of the subset.
print(
    f"Total 'enter Algeria' rows: {total_rows}",
    f"Unique MAILITM_FID in subset: {unique_fids_enter}",
    f"Are all rows unique per FID? {'No' if total_rows != unique_fids_enter else 'Yes'}",
    sep="\n",
)

Total 'enter Algeria' rows: 933293
Unique MAILITM_FID in subset: 886093
Are all rows unique per FID? No


In [69]:
# Step-to-step durations as Timedelta; values that cannot be parsed become NaT.
durations_btw_events = pd.to_timedelta(df["duration_to_next_step"], errors="coerce")
durations_btw_events.shape

(8752582,)

In [71]:
# Number of steps lasting at most 15 days (the bound is inclusive).
# NaT entries compare as False and are therefore not counted.
print("events smaller than 15 = " , durations_btw_events.le(pd.Timedelta(days=15)).sum())

events smaller than 15 =  6727179


In [72]:
# Rows per MAILITM_FID.
id_counts = df["MAILITM_FID"].value_counts()

# Distribution of those row counts: for each rows-per-item value, how many
# items have it, ordered by rows-per-item ascending.
count_of_counts = (
    id_counts
    .value_counts()
    .sort_index()
)

print(count_of_counts)


count
2       24437
3       67603
4       99104
5      113187
6      112030
        ...  
143         1
147         1
152         1
160         1
251         1
Name: count, Length: 126, dtype: int64


In [76]:
# Inspect the item(s) that appear on exactly 251 rows.
# NOTE(review): despite the "_with_0" names, nothing here selects zeros;
# 251 is a hard-coded row count.
ids_with_0 = id_counts.loc[id_counts.eq(251)].index
df_with_0 = df.loc[df["MAILITM_FID"].isin(ids_with_0)]
df_with_0.head()


Unnamed: 0,RECPTCL_FID,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
835189,,RB198581530SG,Recevoir envoi au bureau d'échange (Ent),2022-06-11 11:58:08.340,ALGER COLIS POSTAUX,30,,SG,0 days 00:06:59.143000,15 days 03:26:48.533000
835190,,RB198581530SG,Expédier envoi à adresse nationale (Ent),2022-06-11 12:05:07.483,ALGER COLIS POSTAUX,35,CDD SETIF,SG,0 days 00:33:50.457000,15 days 03:26:48.533000
835191,,RB198581530SG,Expédier envoi à adresse nationale (Ent),2022-06-11 12:38:57.940,ALGER COLIS POSTAUX,35,SECTION PAQUETS CPX ALGER,SG,1 days 19:29:27.980000,15 days 03:26:48.533000
835192,,RB198581530SG,Recevoir envoi au bureau de livraison (Ent),2022-06-13 08:08:25.920,CDD SETIF,32,,SG,0 days 04:39:41.930000,15 days 03:26:48.533000
835193,,RB198581530SG,Expédier envoi à adresse nationale (Srt),2022-06-13 12:48:07.850,CDD SETIF,2,BABOR,SG,4 days 22:23:10.027000,15 days 03:26:48.533000


In [77]:
# True where the next row of the same MAILITM_FID carries the same event name.
# The last row of each item has no successor (NaN after the shift), which
# compares as False.
# NOTE(review): this relies on the rows of each item already being in the
# intended (presumably chronological) order — confirm before trusting it.
dup_run = df["EVENT_TYPE_NM"].eq(
    df.groupby("MAILITM_FID")["EVENT_TYPE_NM"].shift(-1)
)

# Drop every row that is followed by a repeat of itself, i.e. keep only the
# last row of each streak. Other columns are left as they were.
df = df.loc[~dup_run].copy()

In [78]:
# (number of rows, number of columns) of the current frame.
df.shape

(7618347, 10)

In [79]:
# Rows per MAILITM_FID in the current frame.
id_counts = df["MAILITM_FID"].value_counts()

# Distribution of those row counts: for each rows-per-item value, how many
# items have it, ordered by rows-per-item ascending.
count_of_counts = (
    id_counts
    .value_counts()
    .sort_index()
)

print(count_of_counts)


count
1       31494
2       24561
3       85251
4      120168
5      127738
        ...  
127         4
128         2
130         2
133         1
134         1
Name: count, Length: 103, dtype: int64


In [81]:
# Inspect the item(s) that appear on exactly 133 rows.
# NOTE(review): despite the "_with_0" names, nothing here selects zeros;
# 133 is a hard-coded row count.
ids_with_0 = id_counts.loc[id_counts.eq(133)].index
df_with_0 = df.loc[df["MAILITM_FID"].isin(ids_with_0)]
df_with_0.head()


Unnamed: 0,RECPTCL_FID,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
466391,,RV586240137CN,Insérer envoi dans sac (Srt),2022-01-01 09:31:00,CHINE,8,,CN,0 days 18:34:00,206 days 18:34:00
466392,CNBJSADZALGBAUR20001006010052,RV586240137CN,Expédition d'envoi à l'étranger (EDI-reçu),2022-01-02 04:05:00,,12,ALGÉRIE,CN,2 days 06:08:00,206 days 18:34:00
466393,,RV586240137CN,Insérer envoi dans sac (Srt),2022-01-04 10:13:00,CHINE,8,,CN,0 days 17:52:00,206 days 18:34:00
466394,CNBJSADZALGBAUR20007001010052,RV586240137CN,Expédition d'envoi à l'étranger (EDI-reçu),2022-01-05 04:05:00,,12,ALGÉRIE,CN,2 days 05:33:00,206 days 18:34:00
466395,,RV586240137CN,Insérer envoi dans sac (Srt),2022-01-07 09:38:00,CHINE,8,,CN,1 days 18:27:00,206 days 18:34:00
