In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [1]:
import pandas as pd
import io

FILE_PATH = "EXPORT_DATA_envoi_01_01_2022_30_04_2025.csv"
TARGET_BYTES = 5 * 1024**3          # 5 GiB of raw file bytes (header line included)
ENCODING = "utf-8-sig"              # UTF-8 codec that also strips a leading BOM

def load_first_gb(path, target_bytes=TARGET_BYTES, **read_kwargs):
    """Load the leading part of a large CSV, bounded by a raw byte budget.

    The file is read in binary mode. The header line is always kept; after
    that, complete lines are kept for as long as the running byte count
    (header included) does not exceed ``target_bytes``. The first line that
    would cross the budget is dropped together with everything after it, so
    the parsed data never ends in a truncated row.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.
    target_bytes : int, default TARGET_BYTES
        Maximum number of raw bytes (header included) handed to the parser.
    **read_kwargs
        Extra keyword arguments for ``pandas.read_csv``. They take precedence
        over the defaults used here (``sep=";"``, ``encoding=ENCODING``,
        ``on_bad_lines="skip"``, ``dtype=str``, ``low_memory=False``).

    Returns
    -------
    pandas.DataFrame
        The parsed rows; any BOM left at the start of a string column label
        is stripped.
    """
    # Write straight into the in-memory buffer: collecting the lines in a list
    # and joining them afterwards would hold two copies of the data at peak.
    captured = io.BytesIO()

    with open(path, "rb") as f:             # binary mode so raw bytes are counted
        header = f.readline()               # the header is kept unconditionally
        captured.write(header)
        bytes_so_far = len(header)

        # stream line by line until the next line would cross the budget
        for line in f:
            bytes_so_far += len(line)
            if bytes_so_far > target_bytes:
                break
            captured.write(line)

    captured.seek(0)                        # rewind so the parser starts at the header

    # Defaults first, caller overrides second: passing e.g. `sep=` no longer
    # raises "got multiple values for keyword argument".
    options = {
        "sep": ";",
        "encoding": ENCODING,
        "on_bad_lines": "skip",
        "dtype": str,
        "low_memory": False,
    }
    options.update(read_kwargs)

    df = pd.read_csv(captured, **options)

    # Strip any BOM that sneaked into a column name. Non-string labels
    # (e.g. the integers produced by header=None) are left untouched.
    df.columns = df.columns.map(
        lambda label: label.lstrip("\ufeff") if isinstance(label, str) else label
    )

    return df

# --------- grab the first ≈5 GiB of the export (TARGET_BYTES = 5 * 1024**3) ---------
df = load_first_gb(FILE_PATH)
print(df.shape)
# last expression of the cell: shown as the cell's output
df.head()


(42748735, 7)


Unnamed: 0,RECPTCL_FID,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal
0,,CC021659473ES,Insérer envoi dans sac (Srt),2024-01-31 20:11:00.000,ESPAGNE,8,
1,,CC021662747ES,Réception d'envoi du client (Srt),2024-01-31 12:31:00.000,ESPAGNE,1,
2,,CC021662747ES,Insérer envoi dans sac (Srt),2024-02-05 12:46:00.000,ESPAGNE,8,
3,,CC021663328ES,Réception d'envoi du client (Srt),2024-01-31 13:51:00.000,ESPAGNE,1,
4,,CC021663328ES,Insérer envoi dans sac (Srt),2024-02-01 12:32:00.000,ESPAGNE,8,


In [21]:
# Read "df_with_durations_01.csv" and parse its "date" column as datetimes.
# NOTE(review): this rebinds `df`, replacing the frame built by load_first_gb.
# NOTE(review): the source line echoed in this cell's output looks like a pandas
# warning (possibly DtypeWarning for mixed-type columns) — confirm, and if so
# pass an explicit `dtype=` for the affected columns.
df = pd.read_csv("df_with_durations_01.csv", parse_dates=["date"])

  df = pd.read_csv("df_with_durations_01.csv", parse_dates=["date"])


In [22]:
# Preview the first rows of the frame currently bound to `df`.
df.head()

Unnamed: 0,RECPTCL_FID,MAILITM_FID,EVENT_TYPE_NM,date,établissement_postal,EVENT_TYPE_CD,next_établissement_postal,id,duration_to_next_step,total_duration
0,,CA000020800LY,Insérer envoi dans sac (Srt),2022-02-03 10:19:00.000,,8,,LY,6 days 22:15:00,6 days 22:15:00
1,,CA000020800LY,Insérer envoi dans sac (Srt),2022-02-10 08:34:00.000,"LIBYENNE, JAMAHIRIYA ARABE",8,,LY,,6 days 22:15:00
2,,CA000020844RU,Réception d'envoi du client (Srt),2024-12-10 16:40:00.000,"RUSSIE, FÉDÉRATION DE",1,,RU,,0 days 00:00:00
3,,CA000086085US,Réception d'envoi du client (Srt),2024-11-16 00:31:00.000,ÉTATS-UNIS,1,,US,8 days 08:07:00,8 days 08:07:00
4,,CA000086085US,Insérer envoi dans sac (Srt),2024-11-24 08:38:00.000,ÉTATS-UNIS,8,,US,,8 days 08:07:00


In [2]:
# Columns left out of the duplicate comparison.
cols_ignore = [
    "RECPTCL_FID",
    "duration_to_next_step",
    "total_duration",
    "id",
]

# Every remaining column, in its original order, takes part in the comparison.
cols_check = df.columns[~df.columns.isin(cols_ignore)].tolist()

# keep=False flags every member of a duplicate group, not only the repeats.
dup_mask = df.duplicated(subset=cols_check, keep=False)

# Count the flagged rows and report the total.
num_duplicates = dup_mask.sum()
print(f"Number of duplicate rows (ignoring specified columns): {num_duplicates}")




Number of duplicate rows (ignoring specified columns): 25235
