In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import pandas as pd
from pathlib import Path
import gc            # optional: force garbage-collection between files

# ── CONFIGURE THESE ──────────────────────────────────────────────
TEMPLATE     = "df_with_durations_0{}.csv"   # e.g. df_with_durations_01.csv
FILE_NUMBERS = range(1, 8)                   # 1 … 7   (adjust if yours are 0 … 6)
OUT_FILE     = Path("combined_ge15days.csv") # final result
DUR_COL      = "total_duration"              # column to filter
THRESHOLD    = pd.Timedelta(days=15)         # keep rows ≥ 15 days
# ─────────────────────────────────────────────────────────────────

# start fresh (delete old result if it exists)
OUT_FILE.unlink(missing_ok=True)

for i in FILE_NUMBERS:
    src = TEMPLATE.format(i)
    print(f"→ reading {src}")

    # 1) read the whole file (or stream in chunks if the CSVs are huge)
    df = pd.read_csv(src)

    # 2) convert duration column to Timedelta
    df[DUR_COL] = pd.to_timedelta(df[DUR_COL], errors="coerce")

    # 3) filter
    df = df[df[DUR_COL] >= THRESHOLD]

    # 4) append to the output CSV, write header only the first time
    df.to_csv(
        OUT_FILE,
        mode="a",
        header=not OUT_FILE.exists(),  # write header if file was just created
        index=False
    )

    # 5) drop references to free memory before next loop
    del df
    gc.collect()

print("✓ all files processed")
print(f"result saved to: {OUT_FILE.resolve()}")


→ reading df_with_durations_01.csv


  df = pd.read_csv(src)


→ reading df_with_durations_02.csv


  df = pd.read_csv(src)


→ reading df_with_durations_03.csv
→ reading df_with_durations_04.csv
→ reading df_with_durations_05.csv
→ reading df_with_durations_06.csv
→ reading df_with_durations_07.csv
✓ all files processed
result saved to: C:\Users\oussa\Desktop\alg_poste_stage\combined_ge15days.csv


In [2]:
df = pd.read_csv("combined_ge15days.csv", parse_dates=["date"])

FileNotFoundError: [Errno 2] No such file or directory: 'combined_ge15days.csv'

In [3]:
df.head()

Unnamed: 0,RECPTCL_FID,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
0,,CA000387426TG,Réception d'envoi du client (Srt),2023-12-11 14:33:00.000,TOGO,1,,TG,2 days 01:36:00,32 days 00:42:00
1,,CA000387426TG,Insérer envoi dans sac (Srt),2023-12-13 16:09:00.000,TOGO,8,,TG,7 days 16:41:00,32 days 00:42:00
2,,CA000387426TG,Insérer envoi dans sac (Srt),2023-12-21 08:50:00.000,TOGO,8,,TG,22 days 06:25:00,32 days 00:42:00
3,FRCDGADZALGBACN40007002100022,CA000387426TG,Expédition d'envoi à l'étranger (EDI-reçu),2024-01-12 15:15:00.000,TOGO,12,ALGÉRIE,TG,,32 days 00:42:00
4,,CA000422162US,Réception d'envoi du client (Srt),2025-02-03 15:10:00.000,ÉTATS-UNIS,1,,US,12 days 17:45:47.023000,28 days 18:51:38.630000


In [4]:
df.shape

(8901904, 10)

In [5]:
distinct_values_count = df.nunique()

distinct_values_count = distinct_values_count.sort_values(ascending=True)

print(distinct_values_count)

EVENT_TYPE_NM                     24
EVENT_TYPE_CD                     24
id                               132
next_établissement_postal       3630
établissement_postal            3821
RECPTCL_FID                    93357
MAILITM_FID                  1082575
total_duration               1217445
duration_to_next_step        5997372
date                         8228075
dtype: int64


In [6]:

# Count the occurrences of each protocol
event_type = df['EVENT_TYPE_NM'].value_counts()

print(event_type)


EVENT_TYPE_NM
Recevoir envoi au bureau de livraison (Ent)                    2003685
Expédier envoi à adresse nationale (Srt)                       1540194
Expédier envoi à adresse nationale (Ent)                       1500927
Recevoir envoi au bureau d'échange (Ent)                        949610
Vaine tentative de livraison d'envoi (Ent)                      876575
Livraison d'envoi (Ent)                                         847021
Transmettre envoi à l'agent de livraison (Ent)                  503378
Expédition d'envoi à l'étranger (EDI-reçu)                      231978
Insérer envoi dans sac (Srt)                                    212326
Recevoir envoi au bureau d'échange (Srt)                         88054
Enregistrer détails d'envoi au bureau d'échange (Srt)            71805
Expédier envoi à la douane (Ent)                                 27210
Réception d'envoi du client (Srt)                                25037
Renvoyer envoi de la douane (Ent)                              

In [10]:
print("Min date:", df["date"].min())
print("Max date:", df["date"].max())

Min date: 2005-11-08 23:29:54.370
Max date: 2025-05-21 16:13:21.467


In [7]:
durations = df.groupby("MAILITM_FID")["total_duration"].first()
durations = pd.to_timedelta(durations, errors="coerce")
durations.shape

(1082575,)

In [8]:
print("packages btw 15 and 30 = " , (durations <= pd.Timedelta(days=30)).sum())

packages btw 15 and 30 =  728342


In [9]:
print("packages btw 30 and 45 = " , -1*( (durations <= pd.Timedelta(days=30)).sum() - (durations <= pd.Timedelta(days=45)).sum()))

packages btw 30 and 45 =  175047


In [59]:
print("packages btw 45 and 60 = " ,  -(durations <= pd.Timedelta(days=45)).sum() + (durations <= pd.Timedelta(days=60)).sum())

packages btw 45 and 60 =  66236


In [60]:
print("packages btw 60 and 2000 = " ,  -(durations <= pd.Timedelta(days=60)).sum() + (durations <= pd.Timedelta(days=2000)).sum())

packages btw 60 and 2000 =  112946


In [58]:
print("packages biggar 100 = " ,  (durations >= pd.Timedelta(days=100)).sum())

packages biggar 100 =  47851


In [10]:
# Define columns to ignore when checking duplicates
cols_ignore = ["RECPTCL_FID", "duration_to_next_step", "total_duration", "id"]
cols_check  = [c for c in df.columns if c not in cols_ignore]

# 1. Identify all duplicate rows based on cols_check
dup_mask = df.duplicated(subset=cols_check, keep=False)

# 2. Number of such duplicate rows
num_duplicates = dup_mask.sum()
print(f"Number of duplicate rows (ignoring specified columns): {num_duplicates}")




Number of duplicate rows (ignoring specified columns): 298612


In [11]:
df = df.drop_duplicates(subset=cols_check, keep="first").reset_index(drop=True)

In [63]:
enter_algeria_event = "Recevoir envoi au bureau d'échange (Ent)"
df_enter = df[df["EVENT_TYPE_NM"] == enter_algeria_event]

In [64]:
# 2. Total rows vs unique MAILITM_FID in this subset
total_rows = len(df_enter)
unique_fids_enter = df_enter["MAILITM_FID"].nunique()

In [65]:
print(f"Total 'enter Algeria' rows: {total_rows}")
print(f"Unique MAILITM_FID in subset: {unique_fids_enter}")
print(f"Are all rows unique per FID? {'Yes' if total_rows == unique_fids_enter else 'No'}")

Total 'enter Algeria' rows: 933293
Unique MAILITM_FID in subset: 886093
Are all rows unique per FID? No


In [69]:
durations_btw_events = df["duration_to_next_step"]
durations_btw_events = pd.to_timedelta(durations_btw_events, errors="coerce")
durations_btw_events.shape

(8752582,)

In [71]:
print("events smaller than 15 = " , (durations_btw_events <= pd.Timedelta(days=15)).sum())

events smaller than 15 =  6727179


In [72]:
# Step 1: Count how many times each ID appears
id_counts = df["MAILITM_FID"].value_counts()

# Step 2: Count how many IDs have a specific count
count_of_counts = id_counts.value_counts().sort_index()

print(count_of_counts)


count
2       24437
3       67603
4       99104
5      113187
6      112030
        ...  
143         1
147         1
152         1
160         1
251         1
Name: count, Length: 126, dtype: int64


In [76]:
ids_with_0 = id_counts[id_counts == 251].index
df_with_0 = df[df["MAILITM_FID"].isin(ids_with_0)]
df_with_0.head()


Unnamed: 0,RECPTCL_FID,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
835189,,RB198581530SG,Recevoir envoi au bureau d'échange (Ent),2022-06-11 11:58:08.340,ALGER COLIS POSTAUX,30,,SG,0 days 00:06:59.143000,15 days 03:26:48.533000
835190,,RB198581530SG,Expédier envoi à adresse nationale (Ent),2022-06-11 12:05:07.483,ALGER COLIS POSTAUX,35,CDD SETIF,SG,0 days 00:33:50.457000,15 days 03:26:48.533000
835191,,RB198581530SG,Expédier envoi à adresse nationale (Ent),2022-06-11 12:38:57.940,ALGER COLIS POSTAUX,35,SECTION PAQUETS CPX ALGER,SG,1 days 19:29:27.980000,15 days 03:26:48.533000
835192,,RB198581530SG,Recevoir envoi au bureau de livraison (Ent),2022-06-13 08:08:25.920,CDD SETIF,32,,SG,0 days 04:39:41.930000,15 days 03:26:48.533000
835193,,RB198581530SG,Expédier envoi à adresse nationale (Srt),2022-06-13 12:48:07.850,CDD SETIF,2,BABOR,SG,4 days 22:23:10.027000,15 days 03:26:48.533000


In [77]:
dup_run = (
    df.groupby("MAILITM_FID")["EVENT_TYPE_NM"]
      .shift(-1)                    # look one row ahead
      .eq(df["EVENT_TYPE_NM"])      # True if same event repeats
)

# keep rows that are NOT duplicates-in-a-row (i.e., keep the last in each streak)
df = df[~dup_run].copy()

In [78]:
df.shape

(7618347, 10)

In [79]:
# Step 1: Count how many times each ID appears
id_counts = df["MAILITM_FID"].value_counts()

# Step 2: Count how many IDs have a specific count
count_of_counts = id_counts.value_counts().sort_index()

print(count_of_counts)


count
1       31494
2       24561
3       85251
4      120168
5      127738
        ...  
127         4
128         2
130         2
133         1
134         1
Name: count, Length: 103, dtype: int64


In [81]:
ids_with_0 = id_counts[id_counts == 133].index
df_with_0 = df[df["MAILITM_FID"].isin(ids_with_0)]
df_with_0.head()


Unnamed: 0,RECPTCL_FID,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
466391,,RV586240137CN,Insérer envoi dans sac (Srt),2022-01-01 09:31:00,CHINE,8,,CN,0 days 18:34:00,206 days 18:34:00
466392,CNBJSADZALGBAUR20001006010052,RV586240137CN,Expédition d'envoi à l'étranger (EDI-reçu),2022-01-02 04:05:00,,12,ALGÉRIE,CN,2 days 06:08:00,206 days 18:34:00
466393,,RV586240137CN,Insérer envoi dans sac (Srt),2022-01-04 10:13:00,CHINE,8,,CN,0 days 17:52:00,206 days 18:34:00
466394,CNBJSADZALGBAUR20007001010052,RV586240137CN,Expédition d'envoi à l'étranger (EDI-reçu),2022-01-05 04:05:00,,12,ALGÉRIE,CN,2 days 05:33:00,206 days 18:34:00
466395,,RV586240137CN,Insérer envoi dans sac (Srt),2022-01-07 09:38:00,CHINE,8,,CN,1 days 18:27:00,206 days 18:34:00
