In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [1]:
file_name = "df_count_2_part1.csv"   

In [3]:
 
df = pd.read_csv(file_name , parse_dates=["date"])


In [4]:
df.shape

(8700873, 9)

In [5]:
# Read the country reference file
df_countries = pd.read_csv("CT_COUNTRIES.csv", sep=";", header=None, names=["code", "lang", "name"])

country_dict = df_countries.set_index("code")["name"].to_dict()

In [6]:
# Define columns to ignore when checking duplicates
cols_ignore = ["RECPTCL_FID", "duration_to_next_step", "total_duration", "id"]
cols_check  = [c for c in df.columns if c not in cols_ignore]

# 1. Identify all duplicate rows based on cols_check
dup_mask = df.duplicated(subset=cols_check, keep=False)

# 2. Number of such duplicate rows
num_duplicates = dup_mask.sum()
print(f"Number of duplicate rows (ignoring specified columns): {num_duplicates}")




Number of duplicate rows (ignoring specified columns): 136


In [7]:
df = df.drop_duplicates(subset=cols_check, keep="first").reset_index(drop=True)

In [8]:
import pandas as pd

ID_COL    = "MAILITM_FID"
EVENT_COL = "EVENT_TYPE_NM"

# 1) pick the first row per parcel (rows are already in correct order)
first_events = (
    df.groupby(ID_COL, sort=False)[EVENT_COL]
      .first()                 # a Series: index = parcel ID, value = first event
)

# 2) count how many parcels have each first event
event_counts = (
    first_events.value_counts()   # Series: index = event, value = count
               .sort_values(ascending=False)
)

# 3) print the result
print("First-event frequencies:")
for event, count in event_counts.items():
    print(f"{event:40} {count:,}")


First-event frequencies:
Insérer envoi dans sac (Srt)             489,835
Réception d'envoi du client (Srt)        294,069
Expédition d'envoi à l'étranger (EDI-reçu) 44,331
Recevoir envoi au bureau d'échange (Ent) 176
Recevoir envoi au bureau de livraison (Ent) 8
Expédier envoi à adresse nationale (Ent) 4
Recevoir envoi au bureau d'échange (Srt) 3
Recevoir envoi au lieu (Ent)             3


In [9]:
missing_values = df.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print(missing_columns)

établissement_postal          639213
next_établissement_postal    5678279
duration_to_next_step         828428
dtype: int64


In [11]:
df = df.drop(columns=[ "duration_to_next_step", "total_duration"])

In [12]:
missing_values = df.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print(missing_columns)

établissement_postal          639213
next_établissement_postal    5678279
dtype: int64


In [13]:

# Assuming 'df' is your DataFrame!

# Identify the rows with missing values
missing_rows = df[df.isnull().any(axis=1)]

# Print the total number of rows that have missing values
print("Total rows with missing values:", missing_rows.shape[0])




Total rows with missing values: 5827547


In [14]:
# Analyze the missing patterns across rows:
# Create a binary pattern for each row where 1 indicates a missing value and 0 indicates non-missing.
missing_pattern = missing_rows.isnull().astype(int)
# Convert each pattern to a tuple to use as a key for groupby
missing_pattern_tuples = missing_pattern.apply(tuple, axis=1)
# Group by the missing pattern and count the number of rows for each pattern
pattern_summary = missing_rows.groupby(missing_pattern_tuples).size().sort_values(ascending=False)

print("\nSummary of missing patterns (tuple of 0's and 1's corresponding to missing values in each column):")
print(pattern_summary)




Summary of missing patterns (tuple of 0's and 1's corresponding to missing values in each column):
(0, 0, 0, 0, 0, 1, 0)    5188334
(0, 0, 0, 1, 0, 1, 0)     489945
(0, 0, 0, 1, 0, 0, 0)     149268
dtype: int64


In [16]:
distinct_values_count = df.nunique()

distinct_values_count = distinct_values_count.sort_values(ascending=True)

print(distinct_values_count)

EVENT_TYPE_NM                     23
EVENT_TYPE_CD                     23
id                               125
next_établissement_postal       3547
établissement_postal            3704
MAILITM_FID                   828429
date                         5532970
dtype: int64


In [15]:

# Count the occurrences of each protocol
event_type = df['EVENT_TYPE_NM'].value_counts()

print(event_type)


EVENT_TYPE_NM
Recevoir envoi au bureau de livraison (Ent)                    1643829
Expédier envoi à adresse nationale (Ent)                       1511687
Expédier envoi à adresse nationale (Srt)                       1078770
Recevoir envoi au bureau d'échange (Ent)                        880417
Insérer envoi dans sac (Srt)                                    832234
Livraison d'envoi (Ent)                                         718600
Vaine tentative de livraison d'envoi (Ent)                      641377
Transmettre envoi à l'agent de livraison (Ent)                  526903
Expédition d'envoi à l'étranger (EDI-reçu)                      394132
Réception d'envoi du client (Srt)                               311941
Enregistrer détails d'envoi au bureau d'échange (Srt)            93447
Recevoir envoi au bureau d'échange (Srt)                         31734
Expédier envoi à la douane (Ent)                                 23842
Renvoyer envoi de la douane (Ent)                              

In [17]:

# Count the occurrences of each protocol
proto_counts = df['établissement_postal'].value_counts()

print(proto_counts)


établissement_postal
ALGER COLIS POSTAUX    1200525
ALGER GARE             1153144
ÉMIRATS ARABES UNIS     362600
FRANCE                  348397
ANNABA EL MARSA         217177
                        ...   
TAMESNA                      1
BOUYAMINE                    1
BEL-BACHIR                   1
BOU-SFER-PLAGE               1
LAMMAMRA 20 AOUT 55          1
Name: count, Length: 3704, dtype: int64


In [18]:
# Count the occurrences of each protocol
proto_counts = df['next_établissement_postal'].value_counts()

print(proto_counts)



next_établissement_postal
SECTION PAQUETS CPX ALGER    423225
ALGÉRIE                      394132
ALGER GARE                   253072
ORAN COLIS POSTAUX           112598
CONSTANTINE COLIS POSTAUX    107367
                              ...  
VAGUEMESTRE                       1
ZENATA-AEROPORT                   1
ZEGHAIA-ANNASR                    1
OUED EL SADER                     1
ZERIBET-HAMED                     1
Name: count, Length: 3547, dtype: int64


In [19]:

# Count the occurrences of each protocol
proto_counts = df['id'].value_counts()

print(proto_counts)

s = proto_counts


id
SG    4672119
FR    1730991
AE    1390114
DE     134447
ES     114743
       ...   
PE          6
TT          6
GM          5
AO          3
SC          2
Name: count, Length: 125, dtype: int64


In [20]:
# Map the Series index using the country_dict
s.index = s.index.map(lambda code: country_dict.get(code, code))  # Keep unmapped codes as-is
print(s)

id
SINGAPOUR              4672119
FRANCE                 1730991
ÉMIRATS ARABES UNIS    1390114
ALLEMAGNE               134447
ESPAGNE                 114743
                        ...   
PÉROU                        6
TRINITÉ-ET-TOBAGO            6
GAMBIE                       5
ANGOLA                       3
SEYCHELLES                   2
Name: count, Length: 125, dtype: int64


In [21]:
print("Min date:", df["date"].min())
print("Max date:", df["date"].max())

Min date: 2005-11-08 23:29:54
Max date: 2025-05-21 15:50:19


In [22]:
df["date"] = df["date"].str.replace(r'\.\d+$', '', regex=True)
df["date"] = pd.to_datetime(df["date"], infer_datetime_format=True)

AttributeError: Can only use .str accessor with string values!

In [26]:
df = df.sort_values(by=["MAILITM_FID", "date"])

# Calculate time to next step
df["duration_to_next_step"] = df.groupby("MAILITM_FID")["date"].shift(-1) - df["date"]

# Calculate total duration per ID
first_date = df.groupby("MAILITM_FID")["date"].transform("first")
last_date = df.groupby("MAILITM_FID")["date"].transform("last")
df["total_duration"] = last_date - first_date




In [27]:
# Preview the updated dataset
df[["MAILITM_FID", "date", "EVENT_TYPE_NM", "duration_to_next_step", "total_duration"]].head(10)

Unnamed: 0,MAILITM_FID,date,EVENT_TYPE_NM,duration_to_next_step,total_duration
0,CA000679177CY,2023-12-28 12:58:00,Réception d'envoi du client (Srt),4 days 19:09:00,30 days 02:16:34
1,CA000679177CY,2024-01-02 08:07:00,Insérer envoi dans sac (Srt),2 days 03:06:00,30 days 02:16:34
2,CA000679177CY,2024-01-04 11:13:00,Insérer envoi dans sac (Srt),1 days 02:02:00,30 days 02:16:34
3,CA000679177CY,2024-01-05 13:15:00,Expédition d'envoi à l'étranger (EDI-reçu),16 days 18:57:38,30 days 02:16:34
4,CA000679177CY,2024-01-22 08:12:38,Recevoir envoi au bureau d'échange (Ent),0 days 03:55:05,30 days 02:16:34
5,CA000679177CY,2024-01-22 12:07:43,Expédier envoi à adresse nationale (Ent),0 days 01:57:15,30 days 02:16:34
6,CA000679177CY,2024-01-22 14:04:58,Expédier envoi à adresse nationale (Ent),1 days 19:39:18,30 days 02:16:34
7,CA000679177CY,2024-01-24 09:44:16,Recevoir envoi au bureau de livraison (Ent),0 days 00:59:12,30 days 02:16:34
8,CA000679177CY,2024-01-24 10:43:28,Expédier envoi à adresse nationale (Srt),3 days 04:20:30,30 days 02:16:34
9,CA000679177CY,2024-01-27 15:03:58,Vaine tentative de livraison d'envoi (Ent),0 days 00:10:36,30 days 02:16:34


In [37]:
# Filter rows where total_duration is NOT zero
zero_duration_df = df[df["total_duration"] == pd.Timedelta(days = 0)]

# Display the first few rows as a check
zero_duration_df.shape


(0, 9)

In [38]:

# Count the occurrences of each protocol
countries = zero_duration_df['id'].value_counts()

print(countries)


Series([], Name: count, dtype: int64)


In [39]:
# Map the Series index using the country_dict
countries.index = countries.index.map(lambda code: country_dict.get(code, code))  
print(countries)

Series([], Name: count, dtype: int64)


In [40]:
# Step 1: Count how many times each ID appears
id_counts = zero_duration_df["MAILITM_FID"].value_counts()

# Step 2: Count how many IDs have a specific count
count_of_counts = id_counts.value_counts().sort_index()

print(count_of_counts)


Series([], Name: count, dtype: int64)


In [41]:
ids_with_0 = id_counts[id_counts == 2].index
df_with_0 = df[df["MAILITM_FID"].isin(ids_with_0)]
df_with_0.head()

Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration


In [42]:

# Count the occurrences of each protocol
event_type = df_with_0['EVENT_TYPE_NM'].value_counts()

print(event_type)



Series([], Name: count, dtype: int64)


In [43]:
# Drop rows where total_duration is zero
df = df[df["total_duration"] != pd.Timedelta(0)]

# Check the shape after dropping
print(df.shape)


(8700805, 9)


In [50]:
df.to_csv(file_name, index=False, encoding='utf-8-sig')