In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
file_name = "df_count_1.csv"   

In [3]:
 
df = pd.read_csv(file_name , parse_dates=["date"])


In [4]:
df.shape

(567652, 9)

In [5]:
# Read the country reference file
df_countries = pd.read_csv("CT_COUNTRIES.csv", sep=";", header=None, names=["code", "lang", "name"])

country_dict = df_countries.set_index("code")["name"].to_dict()

In [6]:
# Define columns to ignore when checking duplicates
cols_ignore = ["RECPTCL_FID", "duration_to_next_step", "total_duration", "id"]
cols_check  = [c for c in df.columns if c not in cols_ignore]

# 1. Identify all duplicate rows based on cols_check
dup_mask = df.duplicated(subset=cols_check, keep=False)

# 2. Number of such duplicate rows
num_duplicates = dup_mask.sum()
print(f"Number of duplicate rows (ignoring specified columns): {num_duplicates}")




Number of duplicate rows (ignoring specified columns): 10


In [7]:
df = df.drop_duplicates(subset=cols_check, keep="first").reset_index(drop=True)

In [43]:
import pandas as pd

ID_COL    = "MAILITM_FID"
EVENT_COL = "EVENT_TYPE_NM"

# 1) pick the first row per parcel (rows are already in correct order)
first_events = (
    df.groupby(ID_COL, sort=False)[EVENT_COL]
      .first()                 # a Series: index = parcel ID, value = first event
)

# 2) count how many parcels have each first event
event_counts = (
    first_events.value_counts()   # Series: index = event, value = count
               .sort_values(ascending=False)
)

# 3) print the result
print("First-event frequencies:")
for event, count in event_counts.items():
    print(f"{event:40} {count:,}")


First-event frequencies:
Insérer envoi dans sac (Srt)             55,689
Réception d'envoi du client (Srt)        17,294
Expédition d'envoi à l'étranger (EDI-reçu) 6,999
Recevoir envoi au bureau d'échange (Ent) 306
Expédier envoi à adresse nationale (Ent) 18
Recevoir envoi au bureau de livraison (Ent) 9


In [44]:
missing_values = df.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print(missing_columns)

RECPTCL_FID                  559542
établissement_postal          77170
next_établissement_postal    398673
duration_to_next_step         80257
dtype: int64


In [45]:
df = df.drop(columns=["RECPTCL_FID", "duration_to_next_step", "total_duration"])

In [46]:
missing_values = df.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print(missing_columns)

établissement_postal          77170
next_établissement_postal    398673
dtype: int64


In [47]:

# Assuming 'df' is your DataFrame!

# Identify the rows with missing values
missing_rows = df[df.isnull().any(axis=1)]

# Print the total number of rows that have missing values
print("Total rows with missing values:", missing_rows.shape[0])




Total rows with missing values: 411157


In [48]:
# Analyze the missing patterns across rows:
# Create a binary pattern for each row where 1 indicates a missing value and 0 indicates non-missing.
missing_pattern = missing_rows.isnull().astype(int)
# Convert each pattern to a tuple to use as a key for groupby
missing_pattern_tuples = missing_pattern.apply(tuple, axis=1)
# Group by the missing pattern and count the number of rows for each pattern
pattern_summary = missing_rows.groupby(missing_pattern_tuples).size().sort_values(ascending=False)

print("\nSummary of missing patterns (tuple of 0's and 1's corresponding to missing values in each column):")
print(pattern_summary)




Summary of missing patterns (tuple of 0's and 1's corresponding to missing values in each column):
(0, 0, 0, 0, 0, 1, 0)    333987
(0, 0, 0, 1, 0, 1, 0)     64686
(0, 0, 0, 1, 0, 0, 0)     12484
dtype: int64


In [49]:

# Count the occurrences of each protocol
event_type = df['EVENT_TYPE_NM'].value_counts()

print(event_type)


EVENT_TYPE_NM
Recevoir envoi au bureau de livraison (Ent)                    112466
Expédier envoi à adresse nationale (Ent)                        89164
Expédier envoi à adresse nationale (Srt)                        73223
Insérer envoi dans sac (Srt)                                    72389
Recevoir envoi au bureau d'échange (Ent)                        58574
Livraison d'envoi (Ent)                                         50224
Vaine tentative de livraison d'envoi (Ent)                      38517
Transmettre envoi à l'agent de livraison (Ent)                  33823
Réception d'envoi du client (Srt)                               18965
Expédition d'envoi à l'étranger (EDI-reçu)                      18715
Enregistrer détails d'envoi au bureau d'échange (Srt)            9229
Expédier envoi à la douane (Ent)                                 2890
Renvoyer envoi de la douane (Ent)                                1583
Recevoir envoi au bureau d'échange (Srt)                          792
Renvoy

In [9]:

# Count the occurrences of each protocol
proto_counts = df['établissement_postal'].value_counts()

proto_counts


établissement_postal
ALGER COLIS POSTAUX            120334
ALGER GARE                      25902
CONSTANTINE COLIS POSTAUX       19431
ANNABA EL MARSA                 18552
CTR CHLEF                        9293
                                ...  
MANSOURA-NOUVELLE                   1
CHLEF-RP                            1
BENI-MESTER                         1
BIR-EL-DJIR Cdt GUERRAB Med         1
KHESSIBIA                           1
Name: count, Length: 2449, dtype: int64

In [51]:
# Count the occurrences of each protocol
proto_counts = df['next_établissement_postal'].value_counts()

print(proto_counts)



next_établissement_postal
ALGÉRIE                      18715
ALGER GARE                   18269
SECTION PAQUETS CPX ALGER    10072
CONSTANTINE COLIS POSTAUX    10053
ORAN COLIS POSTAUX            9183
                             ...  
TEBESSA-EZZOUHOUR                1
DEUX BASSINS                     1
NADORAH                          1
AGOUNI GUEGHRANE                 1
CHEGLIBI-MAKHLOUF                1
Name: count, Length: 2370, dtype: int64


In [52]:

# Count the occurrences of each protocol
proto_counts = df['id'].value_counts()

print(proto_counts)

s = proto_counts


id
SG    479567
MY     19426
AE     17563
FR     15626
CN      8465
       ...  
UY         1
SD         1
LY         1
IR         1
CO         1
Name: count, Length: 134, dtype: int64


In [53]:
# Map the Series index using the country_dict
s.index = s.index.map(lambda code: country_dict.get(code, code))  # Keep unmapped codes as-is
print(s)

id
SINGAPOUR                        479567
MALAISIE                          19426
ÉMIRATS ARABES UNIS               17563
FRANCE                            15626
CHINE                              8465
                                  ...  
URUGUAY                               1
SOUDAN                                1
LIBYENNE, JAMAHIRIYA ARABE            1
IRAN, RÉPUBLIQUE ISLAMIQUE D'         1
COLOMBIE                              1
Name: count, Length: 134, dtype: int64


In [10]:
print("Min date:", df["date"].min())
print("Max date:", df["date"].max())

Min date: 2017-03-13 08:31:10
Max date: 2025-05-21 15:21:05


In [59]:
df["date"] = df["date"].str.replace(r'\.\d+$', '', regex=True)
df["date"] = pd.to_datetime(df["date"], infer_datetime_format=True)

  df["date"] = pd.to_datetime(df["date"], infer_datetime_format=True)


In [42]:
df = df.sort_values(by=["MAILITM_FID", "date"])

# Calculate time to next step
df["duration_to_next_step"] = df.groupby("MAILITM_FID")["date"].shift(-1) - df["date"]

# Calculate total duration per ID
first_date = df.groupby("MAILITM_FID")["date"].transform("first")
last_date = df.groupby("MAILITM_FID")["date"].transform("last")
df["total_duration"] = last_date - first_date




In [31]:
# Preview the updated dataset
df[["MAILITM_FID", "date", "EVENT_TYPE_NM", "duration_to_next_step", "total_duration"]].head(10)

Unnamed: 0,MAILITM_FID,date,EVENT_TYPE_NM,duration_to_next_step,total_duration
0,CA000020844RU,2024-12-10 16:40:00,Réception d'envoi du client (Srt),NaT,0 days 00:00:00
1,CA000306674TG,2023-04-20 12:14:00,Réception d'envoi du client (Srt),NaT,0 days 00:00:00
2,CA000422162US,2025-02-03 15:10:00,Réception d'envoi du client (Srt),12 days 17:45:47,28 days 18:51:38
3,CA000422162US,2025-02-16 08:55:47,Recevoir envoi au bureau d'échange (Ent),0 days 01:29:54,28 days 18:51:38
4,CA000422162US,2025-02-16 10:25:41,Expédier envoi à adresse nationale (Ent),0 days 00:00:47,28 days 18:51:38
5,CA000422162US,2025-02-16 10:26:28,Expédier envoi à adresse nationale (Ent),1 days 00:24:37,28 days 18:51:38
6,CA000422162US,2025-02-17 10:51:05,Expédier envoi à adresse nationale (Ent),10 days 03:03:13,28 days 18:51:38
7,CA000422162US,2025-02-27 13:54:18,Recevoir envoi au bureau de livraison (Ent),0 days 00:47:27,28 days 18:51:38
8,CA000422162US,2025-02-27 14:41:45,Expédier envoi à adresse nationale (Ent),1 days 17:56:51,28 days 18:51:38
9,CA000422162US,2025-03-01 08:38:36,Recevoir envoi au bureau de livraison (Ent),0 days 00:01:37,28 days 18:51:38


In [43]:
# Filter rows where total_duration is NOT zero
zero_duration_df = df[df["total_duration"] == pd.Timedelta(days = 0)]

# Display the first few rows as a check
zero_duration_df.shape


(13258, 9)

In [44]:

# Count the occurrences of each protocol
countries = zero_duration_df['id'].value_counts()

print(countries)


id
ae    2850
SG    2773
DE    1054
CN     808
US     730
      ... 
NC       1
LI       1
PA       1
CO       1
LY       1
Name: count, Length: 113, dtype: int64


In [45]:
# Map the Series index using the country_dict
countries.index = countries.index.map(lambda code: country_dict.get(code, code))  
print(countries)

id
ae                            2850
SINGAPOUR                     2773
ALLEMAGNE                     1054
CHINE                          808
ÉTATS-UNIS                     730
                              ... 
NOUVELLE-CALÉDONIE               1
LIECHTENSTEIN                    1
PANAMA                           1
COLOMBIE                         1
LIBYENNE, JAMAHIRIYA ARABE       1
Name: count, Length: 113, dtype: int64


In [46]:
# Step 1: Count how many times each ID appears
id_counts = zero_duration_df["MAILITM_FID"].value_counts()

# Step 2: Count how many IDs have a specific count
count_of_counts = id_counts.value_counts().sort_index()

print(count_of_counts)


count
1    13210
2       24
Name: count, dtype: int64


In [47]:
ids_with_0 = id_counts[id_counts == 1].index
df_with_0 = df[df["MAILITM_FID"].isin(ids_with_0)]
df_with_0.head()

Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
0,CA000020844RU,Réception d'envoi du client (Srt),2024-12-10 16:40:00,"RUSSIE, FÉDÉRATION DE",1,,RU,NaT,0 days
1,CA000306674TG,Réception d'envoi du client (Srt),2023-04-20 12:14:00,,1,,TG,NaT,0 days
23,CA001056995US,Réception d'envoi du client (Srt),2025-04-25 12:39:00,ÉTATS-UNIS,1,,US,NaT,0 days
24,CA001285128JE,Réception d'envoi du client (Srt),2024-10-08 23:37:00,,1,,JE,NaT,0 days
74,CA001498423PT,Insérer envoi dans sac (Srt),2023-05-05 13:04:00,PORTUGAL,8,,PT,NaT,0 days


In [48]:

# Count the occurrences of each protocol
event_type = df_with_0['EVENT_TYPE_NM'].value_counts()

print(event_type)



EVENT_TYPE_NM
Expédition d'envoi à l'étranger (EDI-reçu)    6575
Réception d'envoi du client (Srt)             3668
Insérer envoi dans sac (Srt)                  2967
Name: count, dtype: int64


In [49]:
# Drop rows where total_duration is zero
df = df[df["total_duration"] != pd.Timedelta(0)]

# Check the shape after dropping
print(df.shape)


(567652, 9)


In [50]:
df.to_csv(file_name, index=False, encoding='utf-8-sig')