In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [16]:
import pandas as pd

df = pd.read_csv("code_etablissement.csv")

# Preview to confirm
df.head()


Unnamed: 0,bp_num_ips,bp_num_tm,code_upw,upw,bp_nm,type_etablissement,postal_cd
0,3684.0,664,35,UPW BOUMERDES,BORDJ MENAEIL,ETAB,35001
1,354.0,665,35,UPW BOUMERDES,BOUDOUAOU,ETAB,35003
2,898.0,668,35,UPW BOUMERDES,ZEMMOURI,ETAB,35012
3,351.0,669,35,UPW BOUMERDES,THENIA,ETAB,35005
4,3698.0,670,35,UPW BOUMERDES,HEMMADI,ETAB,35015


In [17]:
df1 = pd.read_csv("delai_international.csv")
df1.head()

Unnamed: 0,orig_wilaya_cd,dest_wilaya_cd,type_envoi,delai
0,16,1,C,9
1,16,1,U,9
2,16,2,C,7
3,16,2,U,6
4,16,3,C,9


In [4]:
import pandas as pd
import gc

# --- Load the reference file ---

bp_nm_set = set(df["bp_nm"].dropna().unique())

FILES = [
    "df_count_1.csv", 
    "df_count_2_part1.csv", 
    "df_count_2_part2.csv", 
    "df_count_3_part1.csv", 
    "df_count_3_part2.csv", 
    "df_count_4.csv"
]
ETAB_COL = "établissement_postal"
NEXT_ETAB_COL = "next_établissement_postal"

missing_etab = set()
missing_next_etab = set()

for src in FILES:
    print(f"→ reading {src}")
    df = pd.read_csv(src, usecols=[ETAB_COL, NEXT_ETAB_COL])
    
    # Find missing établissement_postal
    missing_etab.update(set(df[ETAB_COL].dropna().unique()) - bp_nm_set)
    # Find missing next_établissement_postal
    missing_next_etab.update(set(df[NEXT_ETAB_COL].dropna().unique()) - bp_nm_set)
    
    del df
    gc.collect()

print(f"\nNumber of unique établissement_postal NOT in bp_nm: {len(missing_etab)}")
print("Sample (up to 10):", list(missing_etab)[:10])

print(f"\nNumber of unique next_établissement_postal NOT in bp_nm: {len(missing_next_etab)}")
print("Sample (up to 10):", list(missing_next_etab)[:10])


→ reading df_count_1.csv
→ reading df_count_2_part1.csv
→ reading df_count_2_part2.csv
→ reading df_count_3_part1.csv
→ reading df_count_3_part2.csv
→ reading df_count_4.csv

Number of unique établissement_postal NOT in bp_nm: 796
Sample (up to 10): ['MALAISIE', 'EL-MENIAA-RP', 'BENI CHOUGRANE', 'SIDI LARBI', 'BORDJ MHIRIS', 'FAR ALLAH', 'HARICHA', 'BIR SNAB', 'SIDI BOUABIDA', 'EMS ALGER CPX']

Number of unique next_établissement_postal NOT in bp_nm: 698
Sample (up to 10): ['EL-MENIAA-RP', 'BENI CHOUGRANE', 'SECTION PAQUETS CPX ALGER', 'SIDI LARBI', 'BORDJ MHIRIS', 'FAR ALLAH', 'HARICHA', 'BIR SNAB', 'SIDI BOUABIDA', 'EMS ALGER CPX']


In [1]:
import pandas as pd
import gc

# --- Step 1: Build the reference set from the external file ---
df_ref = pd.read_csv("code_etablissement.csv")  # adjust filename if needed

bp_nm_set = set(df_ref.loc[df_ref['code_upw'].isin([16, 76, 77]), 'bp_nm'].dropna().unique())


FILES = [
    "df_count_1.csv", 
    "df_count_2_part1.csv", 
    "df_count_2_part2.csv", 
    "df_count_3_part1.csv", 
    "df_count_3_part2.csv", 
    "df_count_4.csv"
]
PACKAGE_COL = "MAILITM_FID"   # change if your ID column is named differently
ETAB_COL = "établissement_postal"

# --- Step 2: Build a mapping from package_id to all its établissement_postal values ---
package_etabs = dict()  # {package_id: set of établissement_postal}

for src in FILES:
    print(f"→ reading {src}")
    df = pd.read_csv(src, usecols=[PACKAGE_COL, ETAB_COL])
    
    for pkg, etab in zip(df[PACKAGE_COL], df[ETAB_COL]):
        if pd.isna(pkg) or pd.isna(etab): continue
        package_etabs.setdefault(pkg, set()).add(etab)
    
    del df
    gc.collect()

# --- Step 3: Check which packages have at least one matching établissement_postal ---
matching_packages = [
    pkg for pkg, etabs in package_etabs.items()
    if any(e in bp_nm_set for e in etabs)
]

total_packages = len(package_etabs)
matching_count = len(matching_packages)
rate = matching_count / total_packages if total_packages > 0 else 0

print(f"\nTotal unique packages: {total_packages}")
print(f"Packages with at least one établissement_postal in reference set: {matching_count}")
print(f"Rate: {rate:.2%}")

# Optionally, print a sample
print("Sample matching package IDs:", matching_packages[:10])

# Or save the list
# pd.Series(matching_packages).to_csv("matching_packages.csv", index=False)


→ reading df_count_1.csv
→ reading df_count_2_part1.csv
→ reading df_count_2_part2.csv
→ reading df_count_3_part1.csv
→ reading df_count_3_part2.csv
→ reading df_count_4.csv

Total unique packages: 4567659
Packages with at least one établissement_postal in reference set: 4181009
Rate: 91.54%
Sample matching package IDs: ['CA000422162US', 'CA000829398US', 'CA001296001JE', 'CA001299630JE', 'CA001299643JE', 'CA001299657JE', 'CA001315111BF', 'CA001460808BF', 'CA001596465CY', 'CA001824145CY']


In [16]:
# Find non-matching packages
non_matching_packages = [
    pkg for pkg, etabs in package_etabs.items()
    if not any(e in bp_nm_set for e in etabs)
]

# Save the list to CSV
pd.Series(non_matching_packages, name=PACKAGE_COL).to_csv("non_matching_packages_ID's_only.csv", index=False)
print("Saved non-matching package IDs to non_matching_packages.csv")


Saved non-matching package IDs to non_matching_packages.csv


In [17]:
import pandas as pd
import gc

# Assume non_matching_packages is already defined as a list/set

# Take a random sample of up to 40 package IDs
import random
sample_size = min(100, len(non_matching_packages))
sampled_packages = random.sample(non_matching_packages, sample_size)

# Collect all rows for these packages from all files
rows = []
columns_needed = ["MAILITM_FID", "date", "établissement_postal", "EVENT_TYPE_CD", "next_établissement_postal"]

for src in FILES:
    print(f"→ searching {src}")
    df = pd.read_csv(src)
    df_sample = df[df["MAILITM_FID"].isin(sampled_packages)]
    if not df_sample.empty:
        rows.append(df_sample)
    del df, df_sample
    gc.collect()

# Combine and save
if rows:
    final_df = pd.concat(rows, ignore_index=True)
    final_df.to_csv("sample_non_matching_packages_rows.csv", index=False)
    print("Saved sample rows to sample_non_matching_packages_rows.csv")
else:
    print("No matching rows found for the sampled packages.")


→ searching df_count_1.csv
→ searching df_count_2_part1.csv
→ searching df_count_2_part2.csv
→ searching df_count_3_part1.csv
→ searching df_count_3_part2.csv
→ searching df_count_4.csv
Saved sample rows to sample_non_matching_packages_rows.csv


In [18]:
final_df.head()

Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
0,CI605601555DE,Réception d'envoi du client (Srt),2025-01-31 15:25:00,ALLEMAGNE,1,,DE,0 days 17:35:00,2 days 20:05:00
1,CI605601555DE,Insérer envoi dans sac (Srt),2025-02-01 09:00:00,ALLEMAGNE,8,,DE,2 days 02:30:00,2 days 20:05:00
2,CI605601555DE,Expédition d'envoi à l'étranger (EDI-reçu),2025-02-03 11:30:00,ALLEMAGNE,12,ALGÉRIE,DE,,2 days 20:05:00
3,CY584998953DE,Réception d'envoi du client (Srt),2024-04-30 17:13:00,ALLEMAGNE,1,,DE,2 days 15:50:00,4 days 19:17:00
4,CY584998953DE,Insérer envoi dans sac (Srt),2024-05-03 09:03:00,ALLEMAGNE,8,,DE,2 days 03:27:00,4 days 19:17:00


In [12]:
import pandas as pd

df = pd.read_csv("CT_COUNTRIES.csv", sep=';')

# Preview to confirm
df.head()


Unnamed: 0,AD,FR,ANDORRE
0,AE,FR,ÉMIRATS ARABES UNIS
1,AF,FR,AFGHANISTAN
2,AG,FR,ANTIGUA-ET-BARBUDA
3,AI,FR,ANGUILLA
4,AL,FR,ALBANIE


In [19]:
import pandas as pd
import gc

# --- Load reference sets ---
df_ref = pd.read_csv("code_etablissement.csv")
ct_countries = pd.read_csv("CT_COUNTRIES.csv", sep=';')

bp_nm_set = set(df_ref["bp_nm"].dropna().unique())
andorre_set = set(ct_countries["ANDORRE"].dropna().unique())

FILES = [
    "df_count_1.csv", 
    "df_count_2_part1.csv", 
    "df_count_2_part2.csv", 
    "df_count_3_part1.csv", 
    "df_count_3_part2.csv", 
    "df_count_4.csv"
]

all_etab = set()
all_next_etab = set()

for src in FILES:
    print(f"→ reading {src}")
    df = pd.read_csv(src, usecols=["établissement_postal", "next_établissement_postal"])
    all_etab.update(df["établissement_postal"].dropna().unique())
    all_next_etab.update(df["next_établissement_postal"].dropna().unique())
    del df
    gc.collect()

# Union of both columns
all_unique_etabs = all_etab | all_next_etab

# Here’s the key logic: keep only those NOT found in EITHER reference set
missing_anywhere = {e for e in all_unique_etabs if e not in bp_nm_set and e not in andorre_set}

print(f"\nNumber of unique établissement_postal OR next_établissement_postal NOT in bp_nm OR ANDORRE: {len(missing_anywhere)}")
print("Sample (up to 10):", list(missing_anywhere)[:10])


→ reading df_count_1.csv
→ reading df_count_2_part1.csv
→ reading df_count_2_part2.csv
→ reading df_count_3_part1.csv
→ reading df_count_3_part2.csv
→ reading df_count_4.csv

Number of unique établissement_postal OR next_établissement_postal NOT in bp_nm OR ANDORRE: 724
Sample (up to 10): ['EL-MENIAA-RP', 'BENI CHOUGRANE', 'SECTION PAQUETS CPX ALGER', 'BOUSSOUF', 'AIN TAYA', 'SIDI LARBI', 'MDOUKAL', 'KHEMIS MILIANA SOUAMAA', 'BORDJ MHIRIS', 'OULED CHEBEL']


In [21]:
# Save missing_anywhere as a DataFrame and write to CSV
missing_df = pd.DataFrame({"missing_name": list(missing_anywhere)})

missing_df.to_csv("missing_etab_or_andorre.csv", index=False)
print("Saved missing names to missing_etab_or_andorre.csv")


Saved missing names to missing_etab_or_andorre.csv


In [20]:
total_unique = len(all_unique_etabs)
missing_count = len(missing_anywhere)
rate = missing_count / total_unique if total_unique > 0 else 0

print(f"\nTotal unique établissement_postal OR next_établissement_postal: {total_unique}")
print(f"Number NOT in bp_nm OR ANDORRE: {missing_count}")
print(f"Rate: {rate:.2%}")



Total unique établissement_postal OR next_établissement_postal: 4226
Number NOT in bp_nm OR ANDORRE: 724
Rate: 17.13%


In [1]:
import pandas as pd
import gc

# --- Load reference sets ---
df_ref = pd.read_csv("code_etablissement.csv")
ct_countries = pd.read_csv("CT_COUNTRIES.csv", sep=';')

bp_nm_set = set(df_ref["bp_nm"].dropna().unique())
andorre_set = set(ct_countries["ANDORRE"].dropna().unique())

FILES = [
    "df_count_1.csv", 
    "df_count_2_part1.csv", 
    "df_count_2_part2.csv", 
    "df_count_3_part1.csv", 
    "df_count_3_part2.csv", 
    "df_count_4.csv"
]

all_etab = set()

for src in FILES:
    print(f"→ reading {src}")
    df = pd.read_csv(src, usecols=["établissement_postal"])
    all_etab.update(df["établissement_postal"].dropna().unique())
    del df
    gc.collect()

# Only check établissement_postal values
missing_etab = {e for e in all_etab if e not in bp_nm_set and e not in andorre_set}

print(f"\nNumber of unique établissement_postal NOT in bp_nm OR ANDORRE: {len(missing_etab)}")
print("Sample (up to 10):", list(missing_etab)[:10])

# Save to DataFrame if needed
missing_df = pd.DataFrame({"missing_etablissement_postal": list(missing_etab)})



→ reading df_count_1.csv
→ reading df_count_2_part1.csv
→ reading df_count_2_part2.csv
→ reading df_count_3_part1.csv
→ reading df_count_3_part2.csv
→ reading df_count_4.csv

Number of unique établissement_postal NOT in bp_nm OR ANDORRE: 664
Sample (up to 10): ['OULED RAHMOUN GARE', 'EL OUED RP', 'SIDI SALAH', 'EMS ALGER CPX', 'EL MARSA', 'Agence EMS La pêcherie', 'SIDI MEDJAHED', 'OUED KOUBA', 'AIN LAHDJAR', 'KHROUB CENTRE']


In [2]:
total_unique = len(all_etab)
missing_count = len(missing_etab)
rate = missing_count / total_unique if total_unique > 0 else 0

print(f"\nTotal unique établissement_postal: {total_unique}")
print(f"Number NOT in bp_nm OR ANDORRE: {missing_count}")
print(f"Rate: {rate:.2%}")



Total unique établissement_postal: 4009
Number NOT in bp_nm OR ANDORRE: 664
Rate: 16.56%


In [3]:
df_test = pd.read_csv("df_test.csv")

In [4]:
# Assume: missing_etab is already defined (set of missing établissements)
# df_test is loaded, with columns "package_id" and "établissement_postal"

package_col = "MAILITM_FID"  # change if your package column is named differently

# All unique packages in df_test
all_packages = set(df_test[package_col].dropna().unique())

# Packages with at least one missing établissement_postal
packages_with_missing = set(
    df_test.loc[df_test["établissement_postal"].isin(missing_etab), package_col].dropna().unique()
)

num_with_missing = len(packages_with_missing)
total_packages = len(all_packages)
rate = num_with_missing / total_packages if total_packages > 0 else 0

print(f"\nTotal unique packages in df_test: {total_packages}")
print(f"Packages with at least one missing établissement_postal: {num_with_missing}")
print(f"Rate: {rate:.2%}")
print("Sample package IDs:", list(packages_with_missing)[:10])

# Optionally, save the list
# pd.Series(list(packages_with_missing)).to_csv("packages_with_missing_etab.csv", index=False)



Total unique packages in df_test: 8528
Packages with at least one missing établissement_postal: 5020
Rate: 58.86%
Sample package IDs: ['CC094088865NL', 'CA589504354DE', 'CC640681921SE', 'CE021269478SA', 'CJ106006786DE', 'CF154200555AU', 'CC640582107SE', 'CE019269545SA', 'CC640512684SE', 'CC639400045SE']


In [8]:
import pandas as pd

# Load missing_etab if not already in memory
# missing_df = pd.read_csv("missing_etablissement_postal.csv")
# missing_etab = set(missing_df["missing_etablissement_postal"].dropna().unique())

package_col = "MAILITM_FID"  # Change to your actual package ID column name

# Choose the file to use
df = pd.read_csv("df_count_3_part1.csv", usecols=[package_col, "établissement_postal"])

# Group by package and get all their établissements as sets
pkg_to_etabs = df.groupby(package_col)["établissement_postal"].apply(set)

# Keep only packages where all établissements are NOT in missing_etab
valid_packages = [pkg for pkg, etabs in pkg_to_etabs.items() if etabs.isdisjoint(missing_etab)]

# Take the first 10,000 unique package IDs
sampled_packages = valid_packages[:10000]

print(f"Number of packages with ALL établissements valid: {len(valid_packages)}")
print("Sample of 10:", sampled_packages[:10])

# Optionally, save the list
pd.Series(sampled_packages).to_csv("packages_all_etab_valid.csv", index=False)


Number of packages with ALL établissements valid: 27244
Sample of 10: ['CA000623155BF', 'CA000644495PT', 'CA000750018PT', 'CA000788279PT', 'CA000838655PT', 'CA000915176PT', 'CA000961017PT', 'CA000979110PT', 'CA001036094US', 'CA001070351PT']


In [17]:
import pandas as pd

# Load the list of valid package IDs (these are MAILITM_FID values)
sampled_packages = pd.read_csv("packages_all_etab_valid.csv", header=None)[0].tolist()


# These are your columns to keep
columns_to_keep = [
    "MAILITM_FID",
    "date",
    "établissement_postal",
    "EVENT_TYPE_CD",
    "next_établissement_postal",
    "EVENT_TYPE_NM"

]

# Read the relevant columns from the file
df = pd.read_csv("df_count_3_part1.csv", usecols=columns_to_keep)

# Filter rows for only those MAILITM_FID in your sampled_packages
df = df[df["MAILITM_FID"].isin(sampled_packages)]

# Add RECPTCL_FID column filled with missing values (pd.NA)
df["RECPTCL_FID"] = pd.NA

# Save to CSV
df.to_csv("filtered_data_with_recptcl.csv", index=False)
print("Saved as filtered_data_with_recptcl.csv")


Saved as filtered_data_with_recptcl.csv


In [18]:
print("Number of rows:", len(df))
print("Number of unique packages (MAILITM_FID):", df["MAILITM_FID"].nunique())


Number of rows: 115982
Number of unique packages (MAILITM_FID): 10000


In [19]:
df = pd.read_csv("filtered_data_with_recptcl.csv")
df.head()

Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,RECPTCL_FID
0,CA000623155BF,Réception d'envoi du client (Srt),2023-12-12 16:05:00,BURKINA FASO,1,,
1,CA000623155BF,Insérer envoi dans sac (Srt),2024-01-24 15:20:00,BURKINA FASO,8,,
2,CA000623155BF,Expédition d'envoi à l'étranger (EDI-reçu),2024-03-23 12:00:00,BURKINA FASO,12,ALGÉRIE,
3,CA000623155BF,Recevoir envoi au bureau d'échange (Ent),2024-03-30 11:07:23,ALGER COLIS POSTAUX,30,,
4,CA000623155BF,Expédier envoi à adresse nationale (Ent),2024-03-31 10:26:06,ALGER COLIS POSTAUX,35,DRARIA,


In [20]:
import pandas as pd

# Load the filtered data
df = pd.read_csv("filtered_data_with_recptcl.csv")

# Build the reference set
df_ref = pd.read_csv("code_etablissement.csv")
bp_nm_set = set(df_ref.loc[df_ref['code_upw'].isin([16, 76, 77]), 'bp_nm'].dropna().unique())

# Find unique IDs where at least one row matches
ids_with_match = df.loc[df['établissement_postal'].isin(bp_nm_set), 'MAILITM_FID'].unique()
total_ids = df['MAILITM_FID'].nunique()
matching_count = len(ids_with_match)
rate = matching_count / total_ids if total_ids > 0 else 0

print(f"Total unique MAILITM_FID: {total_ids}")
print(f"Unique MAILITM_FID with at least one matching row: {matching_count}")
print(f"Rate: {rate:.2%}")
print("Sample matching IDs:", ids_with_match[:10])


Total unique MAILITM_FID: 10000
Unique MAILITM_FID with at least one matching row: 9992
Rate: 99.92%
Sample matching IDs: ['CA000623155BF' 'CA000644495PT' 'CA000750018PT' 'CA000788279PT'
 'CA000838655PT' 'CA000915176PT' 'CA000961017PT' 'CA000979110PT'
 'CA001036094US' 'CA001070351PT']


In [21]:
import pandas as pd

# Function to normalize Algiers UPW codes
def normalize_algiers(upw):
    return 16 if upw in [16, 76, 77] else upw

# Normalize code_upw in your events dataframe
df_events["normalized_upw"] = df_events["code_upw"].apply(normalize_algiers)

# Count unique normalized UPWs for each package
package_upw_counts = df_events.groupby("MAILITM_FID")["normalized_upw"].nunique()

# Find packages with more than 2 distinct normalized UPWs
packages_with_many_upw = package_upw_counts[package_upw_counts > 2].index.tolist()

print(f"Number of packages with more than 2 distinct UPWs: {len(packages_with_many_upw)}")
print("Sample package IDs:", packages_with_many_upw[:10])


Number of packages with more than 2 distinct UPWs: 4300
Sample package IDs: ['CA000750018PT', 'CA000788279PT', 'CA000838655PT', 'CA000915176PT', 'CA000961017PT', 'CA001070351PT', 'CA001612548PT', 'CA001756745PT', 'CA002077183PT', 'CA045574790RU']


In [22]:
import pandas as pd

# Function to normalize Algiers UPW codes
def normalize_algiers(upw):
    return 16 if upw in [16, 76, 77] else upw

# Normalize code_upw in your events dataframe
df_events["normalized_upw"] = df_events["code_upw"].apply(normalize_algiers)

# Count unique normalized UPWs for each package
package_upw_counts = df_events.groupby("MAILITM_FID")["normalized_upw"].nunique()

# Find packages with more than 2 distinct normalized UPWs
packages_with_many_upw = package_upw_counts[package_upw_counts > 2].index.tolist()

print(f"Number of packages with more than 2 distinct UPWs: {len(packages_with_many_upw)}")
print("Sample package IDs:", packages_with_many_upw[:10])


Number of packages with more than 2 distinct UPWs: 4300
Sample package IDs: ['CA000750018PT', 'CA000788279PT', 'CA000838655PT', 'CA000915176PT', 'CA000961017PT', 'CA001070351PT', 'CA001612548PT', 'CA001756745PT', 'CA002077183PT', 'CA045574790RU']


In [23]:
import pandas as pd

# Load filtered data
df = pd.read_csv("filtered_data_with_recptcl.csv")

# Build reference set (already done above, keep for clarity)
df_ref = pd.read_csv("code_etablissement.csv")
bp_nm_set = set(df_ref.loc[df_ref['code_upw'].isin([16, 76, 77]), 'bp_nm'].dropna().unique())

# Find matching package IDs
ids_with_match = df.loc[df['établissement_postal'].isin(bp_nm_set), 'MAILITM_FID'].unique()

# Filter DataFrame for those IDs
df_matching = df[df['MAILITM_FID'].isin(ids_with_match)]

# Save to CSV
df_matching.to_csv("filtered_data_with_matching_id.csv", index=False)
print("Saved as filtered_data_with_matching_id.csv")


Saved as filtered_data_with_matching_id.csv


# DELAI COUNTING

In [24]:
df = pd.read_csv("filtered_data_with_matching_id.csv")

In [25]:
df.head()

Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,RECPTCL_FID
0,CA000623155BF,Réception d'envoi du client (Srt),2023-12-12 16:05:00,BURKINA FASO,1,,
1,CA000623155BF,Insérer envoi dans sac (Srt),2024-01-24 15:20:00,BURKINA FASO,8,,
2,CA000623155BF,Expédition d'envoi à l'étranger (EDI-reçu),2024-03-23 12:00:00,BURKINA FASO,12,ALGÉRIE,
3,CA000623155BF,Recevoir envoi au bureau d'échange (Ent),2024-03-30 11:07:23,ALGER COLIS POSTAUX,30,,
4,CA000623155BF,Expédier envoi à adresse nationale (Ent),2024-03-31 10:26:06,ALGER COLIS POSTAUX,35,DRARIA,


In [26]:
import pandas as pd

# Load your files
df_data = pd.read_csv("filtered_data_with_matching_id.csv")
df_etab = pd.read_csv("code_etablissement.csv")

# Pick one package ID to try (replace with an actual ID you want to test)
example_package = df_data["MAILITM_FID"].iloc[0]  # Or put an ID like: 'CA000623155BF'

# 1. Get all etablissement_postal for the package
etabs = df_data[df_data["MAILITM_FID"] == example_package]["établissement_postal"].unique()

print(f"Etablissements visited by {example_package}: {etabs}")

# 2. For each etablissement, get code_upw from the reference file
upw_codes = (
    df_etab[df_etab["bp_nm"].isin(etabs)]["code_upw"]
    .dropna().unique()
)

print(f"UPW codes visited by {example_package}: {upw_codes}")

# 3. If you want, print as a list
print("List of UPW codes:", list(upw_codes))


Etablissements visited by CA000623155BF: ['BURKINA FASO' 'ALGER COLIS POSTAUX' 'DRARIA']
UPW codes visited by CA000623155BF: [77 16]
List of UPW codes: [77, 16]


In [27]:
import pandas as pd

# Load the data
df_data = pd.read_csv("filtered_data_with_matching_id.csv")

# Use one package as an example
example_package = df_data["MAILITM_FID"].iloc[0]  # or set to a specific ID
df_pkg = df_data[df_data["MAILITM_FID"] == example_package].copy()

# Make sure 'date' is datetime
df_pkg["date"] = pd.to_datetime(df_pkg["date"])

# Sort by date
df_pkg = df_pkg.sort_values("date")

# Find blocks of each etablissement
blocks = []
for etab, group in df_pkg.groupby("établissement_postal", sort=False):
    times = group["date"].sort_values()
    blocks.append({
        "etab": etab,
        "first_time": times.iloc[0],
        "last_time": times.iloc[-1]
    })

# Now calculate transitions
for i in range(len(blocks) - 1):
    prev = blocks[i]
    nxt = blocks[i + 1]
    period = nxt["first_time"] - prev["last_time"]
    print(
        f"{prev['etab']} → {nxt['etab']}: Exit {prev['last_time']}, "
        f"Entry {nxt['first_time']}, Period: {period}"
    )


BURKINA FASO → ALGER COLIS POSTAUX: Exit 2024-03-23 12:00:00, Entry 2024-03-30 11:07:23, Period: 6 days 23:07:23
ALGER COLIS POSTAUX → DRARIA: Exit 2024-03-31 10:26:06, Entry 2024-04-01 09:56:00, Period: 0 days 23:29:54


In [28]:
df_data = pd.read_csv("filtered_data_with_matching_id.csv")
df_data.head()

Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,RECPTCL_FID
0,CA000623155BF,Réception d'envoi du client (Srt),2023-12-12 16:05:00,BURKINA FASO,1,,
1,CA000623155BF,Insérer envoi dans sac (Srt),2024-01-24 15:20:00,BURKINA FASO,8,,
2,CA000623155BF,Expédition d'envoi à l'étranger (EDI-reçu),2024-03-23 12:00:00,BURKINA FASO,12,ALGÉRIE,
3,CA000623155BF,Recevoir envoi au bureau d'échange (Ent),2024-03-30 11:07:23,ALGER COLIS POSTAUX,30,,
4,CA000623155BF,Expédier envoi à adresse nationale (Ent),2024-03-31 10:26:06,ALGER COLIS POSTAUX,35,DRARIA,


In [29]:
import pandas as pd

# Load your event data and the etablissement lookup
df_data = pd.read_csv("filtered_data_with_matching_id.csv")
df_etab = pd.read_csv("code_etablissement.csv")

# Use one package as an example
example_package = df_data["MAILITM_FID"].iloc[10]  # Or any package ID

# Merge code_upw into your package events
df_pkg = df_data[df_data["MAILITM_FID"] == example_package].copy()
df_pkg = df_pkg.merge(df_etab[["bp_nm", "code_upw"]], left_on="établissement_postal", right_on="bp_nm", how="left")

# Make sure date is datetime
df_pkg["date"] = pd.to_datetime(df_pkg["date"])

# Sort by time
df_pkg = df_pkg.sort_values("date")

# Find "blocks" of each consecutive code_upw
blocks = []
for code_upw, group in df_pkg.groupby("code_upw", sort=False):
    times = group["date"].sort_values()
    blocks.append({
        "code_upw": code_upw,
        "first_time": times.iloc[0],
        "last_time": times.iloc[-1]
    })

# Calculate periods between consecutive code_upw transitions
for i in range(len(blocks) - 1):
    prev = blocks[i]
    nxt = blocks[i + 1]
    period = nxt["first_time"] - prev["last_time"]
    print(
        f"UPW {prev['code_upw']} → UPW {nxt['code_upw']}: Exit {prev['last_time']}, "
        f"Entry {nxt['first_time']}, Period: {period}"
    )


UPW 16.0 → UPW 6.0: Exit 2022-01-25 10:14:46, Entry 2022-01-31 10:56:24, Period: 6 days 00:41:38


In [30]:
import pandas as pd

# Load your table
df = pd.read_csv("delai_international.csv")

# Build lookup: (origin, dest, type_envoi) → delai
# (for 16 → X, 16 → 76, 16 → 77)
lookup = {}
for _, row in df.iterrows():
    lookup[(row["orig_wilaya_cd"], row["dest_wilaya_cd"], row["type_envoi"])] = row["delai"]

def get_delai(origin, dest, type_envoi):
    # If origin in (76, 77), treat as 16
    if origin in [76, 77]:
        origin = 16
    # If dest in (16, 76, 77) and origin not 16/76/77, treat as 16→origin (reverse)
    if dest in [16, 76, 77] and origin not in [16, 76, 77]:
        return lookup.get((16, origin, type_envoi))
    # Otherwise: normal lookup
    return lookup.get((origin, dest, type_envoi))

# === EXAMPLES ===
# 16 → 25: normal
print("16→25 U:", get_delai(16, 25, "U"))
# 76 → 25: treat as 16 → 25
print("76→25 U:", get_delai(76, 25, "U"))
# 16 → 76: as in table
print("16→76 U:", get_delai(16, 76, "U"))
# 25 → 16: treat as 16 → 25
print("25→16 U:", get_delai(25, 16, "U"))
# 25 → 77: treat as 16 → 25
print("25→77 U:", get_delai(25, 77, "U"))
# 25 → 19: normal lookup
print("25→19 U:", get_delai(25, 19, "U"))


16→25 U: 6
76→25 U: 6
16→76 U: 5
25→16 U: 6
25→77 U: 6
25→19 U: None


In [31]:
import pandas as pd

# 1. Load reference data
df_etab = pd.read_csv("code_etablissement.csv")
df_delai = pd.read_csv("delai_international.csv")
df_events = pd.read_csv("filtered_data_with_matching_id.csv")

# 2. Product type functions
PRODUCT_TYPE_MAP = {
    "EMS":       ("EA", "EZ"),
    "Letter Post Tracked": ("LA", "LZ"),
    "M bags":    ("MA", "MZ"),
    "IBRS":      ("QA", "QM"),
    "Letter Post Registered": ("RA", "RZ"),
    "Letter Post (goods)": ("UA", "UZ"),
    "Letter Post Insured": ("VA", "VZ"),
    "Parcel Post": ("CA", "CZ"),
    "ECOMPRO Parcel": ("HA", "HZ"),
}
def get_product_type(mailitm_fid):
    if not isinstance(mailitm_fid, str) or len(mailitm_fid) < 2:
        return "UNKNOWN"
    indicator = mailitm_fid[:2].upper()
    for ptype, (start, end) in PRODUCT_TYPE_MAP.items():
        if start <= indicator <= end:
            return ptype
    return "Other/Unknown"
def get_type_envoi_from_product(product_type):
    if product_type in ["Parcel Post", "ECOMPRO Parcel"]:
        return "C"
    else:
        return "U"

# 3. Add type columns
df_events["product_type"] = df_events["MAILITM_FID"].apply(get_product_type)
df_events["type_envoi"] = df_events["product_type"].apply(get_type_envoi_from_product)

# Merge UPW codes for each event
df_events = df_events.merge(df_etab[["bp_nm", "code_upw"]], left_on="établissement_postal", right_on="bp_nm", how="left")

# 4. Prepare delai lookup with business rules
def normalize_algiers(val):
    return 16 if val in {16, 76, 77} else val

delai_lookup = {}
for _, row in df_delai.iterrows():
    key = (
        normalize_algiers(row["orig_wilaya_cd"]),
        normalize_algiers(row["dest_wilaya_cd"]),
        row["type_envoi"]
    )
    delai_lookup[key] = row["delai"]

def get_delai(origin, dest, type_envoi):
    # For Algiers variants as origin
    if origin in [76, 77]:
        origin = 16
    # For Algiers as destination and other origin, reverse lookup
    if dest in [16, 76, 77] and origin not in [16, 76, 77]:
        return delai_lookup.get((16, origin, type_envoi))
    # Otherwise, normal lookup
    return delai_lookup.get((origin, dest, type_envoi))

# 5. Analyze 100 sample packages
import random
unique_packages = df_events["MAILITM_FID"].unique()
sample_packages = random.sample(list(unique_packages), min(100, len(unique_packages)))
results = []

for package_id in sample_packages:
    pkg_df = df_events[df_events["MAILITM_FID"] == package_id].sort_values("date")
    pkg_df = pkg_df[~pkg_df["code_upw"].isna()]
    code_upw_seq = pkg_df["code_upw"].tolist()
    time_seq = pd.to_datetime(pkg_df["date"]).tolist()
    type_envoi = pkg_df["type_envoi"].iloc[0]
    for i in range(len(code_upw_seq)-1):
        prev_upw = code_upw_seq[i]
        next_upw = code_upw_seq[i+1]
        if prev_upw == next_upw:
            continue
        exit_time = time_seq[i]
        entry_time = time_seq[i+1]
        period_days = (entry_time - exit_time).days
        allowed_delai = get_delai(prev_upw, next_upw, type_envoi)
        status = "EXCEEDS DELAI" if allowed_delai is not None and period_days > allowed_delai else "OK"
        results.append({
            "MAILITM_FID": package_id,
            "from_code_upw": prev_upw,
            "to_code_upw": next_upw,
            "exit_time": exit_time,
            "entry_time": entry_time,
            "period_days": period_days,
            "allowed_delai": allowed_delai,
            "type_envoi": type_envoi,
            "status": status
        })

# 6. Show a sample of results (first 20 transitions as a preview)
import pandas as pd
results_df = pd.DataFrame(results)
results_df.head(20)


Unnamed: 0,MAILITM_FID,from_code_upw,to_code_upw,exit_time,entry_time,period_days,allowed_delai,type_envoi,status
0,CP100842763LU,16.0,31.0,2022-08-10 10:57:45,2022-08-14 11:06:38,4,7.0,C,OK
1,CY558121111DE,16.0,2.0,2022-01-16 10:42:57,2022-01-22 09:54:31,5,7.0,C,OK
2,CY558121111DE,2.0,38.0,2022-01-22 10:15:36,2022-01-23 10:01:41,0,,C,OK
3,CG016051668DE,16.0,25.0,2022-11-13 09:47:30,2022-11-15 11:53:58,2,7.0,C,OK
4,CG016051668DE,25.0,4.0,2022-11-15 11:53:58,2022-11-19 08:16:05,3,,C,OK
5,CY561286172DE,16.0,31.0,2022-11-13 09:10:26,2022-11-15 08:55:56,1,7.0,C,OK
6,CG014584960DE,16.0,25.0,2022-07-17 09:40:10,2022-07-20 09:41:32,3,7.0,C,OK
7,CG014584960DE,25.0,18.0,2022-07-20 09:41:32,2022-07-21 11:48:42,1,,C,OK
8,CY561249740DE,16.0,31.0,2022-06-06 10:09:10,2022-06-08 09:53:22,1,7.0,C,OK
9,CY561249740DE,31.0,13.0,2022-06-08 10:39:37,2022-06-11 09:19:10,2,,C,OK


In [32]:
import pandas as pd

# Function to normalize Algiers UPW codes
def normalize_algiers(upw):
    return 16 if upw in [16, 76, 77] else upw

# Normalize code_upw in your events dataframe
df_events["normalized_upw"] = df_events["code_upw"].apply(normalize_algiers)

# Count unique normalized UPWs for each package
package_upw_counts = df_events.groupby("MAILITM_FID")["normalized_upw"].nunique()

# Find packages with more than 2 distinct normalized UPWs
packages_with_many_upw = package_upw_counts[package_upw_counts > 2].index.tolist()

print(f"Number of packages with more than 2 distinct UPWs: {len(packages_with_many_upw)}")
print("Sample package IDs:", packages_with_many_upw[:10])


Number of packages with more than 2 distinct UPWs: 4300
Sample package IDs: ['CA000750018PT', 'CA000788279PT', 'CA000838655PT', 'CA000915176PT', 'CA000961017PT', 'CA001070351PT', 'CA001612548PT', 'CA001756745PT', 'CA002077183PT', 'CA045574790RU']


In [33]:
import pandas as pd

def is_round_trip_to_algiers(upw_seq):
    algiers_set = {16, 76, 77}
    seq_norm = [16 if x in algiers_set else x for x in upw_seq if pd.notna(x)]
    algiers_indexes = [i for i, x in enumerate(seq_norm) if x == 16]
    # Need at least two, not consecutive at start
    return len(algiers_indexes) >= 2 and (algiers_indexes[-1] > algiers_indexes[0] + 1)

# Make sure code_upw is present and normalized
df_events["normalized_upw"] = df_events["code_upw"].apply(lambda x: 16 if x in [16, 76, 77] else x)

# Group by package, check for Algiers → other → Algiers pattern
round_trip_packages = []
for pkg, group in df_events.groupby("MAILITM_FID"):
    seq = group.sort_values("date")["normalized_upw"].tolist()
    if is_round_trip_to_algiers(seq):
        round_trip_packages.append(pkg)

print(f"Number of packages with Algiers → other → Algiers: {len(round_trip_packages)}")
print("Sample package IDs:", round_trip_packages[:10])


Number of packages with Algiers → other → Algiers: 8566
Sample package IDs: ['CA000623155BF', 'CA000644495PT', 'CA000750018PT', 'CA000838655PT', 'CA000915176PT', 'CA000979110PT', 'CA001036094US', 'CA001070351PT', 'CA001196799PT', 'CA001359775PT']


In [34]:
import pandas as pd

# Assume round_trip_packages is already defined as in previous step

# Take a random sample of 10 from those packages
import random
sample_10 = random.sample(round_trip_packages, 10)

# Get all the rows for these 10 packages
sample_rows = df_events[df_events["MAILITM_FID"].isin(sample_10)]

sample_rows


Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,RECPTCL_FID,product_type,type_envoi,bp_nm,code_upw,normalized_upw
3630,CB969811042DE,Réception d'envoi du client (Srt),2024-10-08 16:51:00,ALLEMAGNE,1,,,Parcel Post,C,,,
3631,CB969811042DE,Insérer envoi dans sac (Srt),2024-10-09 05:00:00,ALLEMAGNE,8,,,Parcel Post,C,,,
3632,CB969811042DE,Expédition d'envoi à l'étranger (EDI-reçu),2024-10-10 12:30:00,ALLEMAGNE,12,ALGÉRIE,,Parcel Post,C,,,
3633,CB969811042DE,Recevoir envoi au bureau d'échange (Ent),2024-10-16 08:32:28,ALGER COLIS POSTAUX,30,,,Parcel Post,C,ALGER COLIS POSTAUX,16.0,16.0
3634,CB969811042DE,Expédier envoi à adresse nationale (Ent),2024-10-16 11:03:23,ALGER COLIS POSTAUX,35,DRARIA,,Parcel Post,C,ALGER COLIS POSTAUX,16.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...
108419,CY561092653DE,Recevoir envoi au bureau de livraison (Ent),2022-08-15 09:06:35,CDD TLEMCEN,32,,,Parcel Post,C,CDD TLEMCEN,13.0,13.0
108420,CY561092653DE,Expédier envoi à adresse nationale (Srt),2022-08-15 09:09:21,CDD TLEMCEN,2,CHETOUANE-1ER-NOV,,Parcel Post,C,CDD TLEMCEN,13.0,13.0
108421,CY561092653DE,Recevoir envoi au bureau de livraison (Ent),2022-08-17 09:46:27,CHETOUANE-1ER-NOV,32,,,Parcel Post,C,CHETOUANE-1ER-NOV,13.0,13.0
108422,CY561092653DE,Vaine tentative de livraison d'envoi (Ent),2022-08-17 09:47:11,CHETOUANE-1ER-NOV,36,,,Parcel Post,C,CHETOUANE-1ER-NOV,13.0,13.0


In [1]:
import pandas as pd

# 1. Load your event and etablissement data
df_events = pd.read_csv("filtered_data_with_matching_id.csv")
df_etab = pd.read_csv("code_etablissement.csv")

# 2. Merge in code_upw from etab reference
df_events = df_events.merge(
    df_etab[["bp_nm", "code_upw"]],
    left_on="établissement_postal",
    right_on="bp_nm",
    how="left"
)

# 3. Make sure code_upw is integer
df_events["code_upw"] = pd.to_numeric(df_events["code_upw"], errors="coerce").astype("Int64")

# 4. The set representing Algiers UPW codes
algiers_set = {16, 76, 77}

def detect_true_return_to_algiers(upw_seq):
    """Returns True if package leaves Algiers, then returns to Algiers later."""
    left = False
    for upw in upw_seq:
        if pd.isna(upw):
            continue
        if not left:
            if upw not in algiers_set:
                left = True  # First time it leaves Algiers
        else:
            if upw in algiers_set:
                return True  # Returned after leaving
    return False

# 5. Apply to all packages
truly_returned = []
for pkg, group in df_events.groupby("MAILITM_FID"):
    seq = group.sort_values("date")["code_upw"].tolist()
    if detect_true_return_to_algiers(seq):
        truly_returned.append(pkg)

print("Number of packages with a true return to Algiers:", len(truly_returned))
print("Sample package IDs:", truly_returned[:10])

# 6. (Optional) Show all event rows for a sample of 10
import random
sample_pkgs = random.sample(truly_returned, min(10, len(truly_returned)))
sample_rows = df_events[df_events["MAILITM_FID"].isin(sample_pkgs)]
sample_rows.sort_values(["MAILITM_FID", "date"])


Number of packages with a true return to Algiers: 203
Sample package IDs: ['CA001888273PT', 'CA002466679PT', 'CA406826736DE', 'CA777836955DE', 'CB643615025DE', 'CC049139932NL', 'CC067391348NL', 'CC076349745FR', 'CC087523533NL', 'CC087524882NL']


Unnamed: 0,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,RECPTCL_FID,bp_nm,code_upw
7881,CC097952157NL,Insérer envoi dans sac (Srt),2025-03-15 14:01:00,PAYS-BAS,8,,,,
7882,CC097952157NL,Expédition d'envoi à l'étranger (EDI-reçu),2025-03-18 18:30:00,PAYS-BAS,12,ALGÉRIE,,,
7883,CC097952157NL,Recevoir envoi au bureau d'échange (Ent),2025-03-22 08:55:11,ALGER COLIS POSTAUX,30,,,ALGER COLIS POSTAUX,16
7884,CC097952157NL,Expédier envoi à adresse nationale (Ent),2025-03-22 11:34:37,ALGER COLIS POSTAUX,35,ORAN COLIS POSTAUX,,ALGER COLIS POSTAUX,16
7885,CC097952157NL,Recevoir envoi au bureau de livraison (Ent),2025-03-24 09:59:02,ORAN EL M-NAOUER,32,,,ORAN EL M-NAOUER,31
...,...,...,...,...,...,...,...,...,...
117177,CY564054739DE,Recevoir envoi au bureau de livraison (Ent),2023-01-18 08:56:53,ORAN COLIS POSTAUX,32,,,ORAN COLIS POSTAUX,31
117178,CY564054739DE,Expédier envoi à adresse nationale (Srt),2023-01-18 09:34:17,ORAN COLIS POSTAUX,2,ALGER COLIS POSTAUX,,ORAN COLIS POSTAUX,31
117179,CY564054739DE,Expédier envoi à adresse nationale (Ent),2023-01-21 08:26:49,ALGER COLIS POSTAUX,35,AVION CPX ALGER,,ALGER COLIS POSTAUX,16
117180,CY564054739DE,Recevoir envoi au bureau d'échange (Srt),2023-01-21 09:20:46,ALGER COLIS POSTAUX,3,AVION CPX ALGER,,ALGER COLIS POSTAUX,16


In [None]:
df = pd.read_excel("input.xlsx")