In [None]:
# ---------- Helpers ----------
def load_data():
    """Read the raw train and test CSV files.

    The file locations come from the module-level ``TRAIN_CSV`` and
    ``TEST_CSV`` names.

    Returns:
        A ``(train, test)`` tuple of DataFrames, read in that order.
    """
    from pandas import read_csv

    return read_csv(TRAIN_CSV), read_csv(TEST_CSV)

def quick_inspect(df, name="dataframe", n=5):
    """Print a quick overview of a DataFrame.

    Shows the shape, the first ``n`` rows, the ``info()`` report, per-column
    null counts, the number of duplicated ``sample_id`` values (when that
    column exists) and summary statistics of ``price`` (when that column
    exists).

    Args:
        df: DataFrame to inspect.
        name: Label used in the header line.
        n: Number of leading rows to show.

    Returns:
        None. Everything is written to stdout (or rendered by ``display``).
    """
    # `display` is only available inside IPython/Jupyter; fall back to a plain
    # print so the helper also works when run as a regular script.
    try:
        show = display
    except NameError:
        show = print

    print(f"--- {name} shape: {df.shape} ---")
    show(df.head(n))
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line.
    df.info()
    print("Null counts:")
    print(df.isnull().sum())
    if 'sample_id' in df.columns:
        print("sample_id duplicates:", df['sample_id'].duplicated().sum())
    if 'price' in df.columns:
        print("price summary:")
        print(df['price'].describe())

In [None]:
# ---------- Text cleaning ----------
def clean_text(s):
    """Normalise one catalog text value to a lowercase, single-spaced string.

    Missing values (as judged by ``pandas.isna``) become ``""``. Otherwise
    the value is stringified, line breaks are turned into spaces, and a fixed
    sequence of regex substitutions is applied, each replacing its matches
    with a single space.

    Args:
        s: Raw value; anything ``str()`` accepts, or a missing value.

    Returns:
        The cleaned, stripped, lowercased text.
    """
    import re
    import pandas as pd

    if pd.isna(s):
        return ""

    # Applied in order; every match is replaced by one space.
    substitutions = (
        r"<[^>]+>",                               # HTML tags
        r"http\S+|www\.\S+",                      # URLs
        r"[\u2000-\u206F\u2E00-\u2E7F\\'\"()]",   # unicode punctuation, backslash, quotes, parens
        r"[^A-Za-z0-9\.\,\%\-\s/]",               # anything but letters, digits, . , % - / and whitespace
        r"\s+",                                   # collapse whitespace runs
    )

    text = str(s).replace("\n", " ").replace("\r", " ")
    for pattern in substitutions:
        text = re.sub(pattern, " ", text)
    return text.strip().lower()

In [None]:
# ---------- Image download with retry ----------
def download_image(url, dest_path, timeout=8, max_retries=3):
    """Download an image, re-encode it as JPEG and save it, with retries.

    Each attempt fetches ``url``, decodes the payload with PIL (which also
    verifies that it is a readable image), converts it to RGB and writes it
    to ``dest_path`` as a quality-85 JPEG. Failures of any kind are retried
    with a growing pause between attempts; this is deliberately best-effort,
    so the function reports errors instead of raising.

    Args:
        url: Image URL to fetch.
        dest_path: Destination file; a ``str`` or ``pathlib.Path``.
        timeout: Timeout in seconds passed to ``requests.get``.
        max_retries: Maximum number of attempts.

    Returns:
        ``{"ok": True, "size": <bytes on disk>}`` on success, otherwise
        ``{"ok": False, "error": <message of the last failure>}``.
    """
    from pathlib import Path

    # Accept plain string paths too; .stat() below needs a Path.
    dest_path = Path(dest_path)
    headers = {"User-Agent": "Mozilla/5.0 (compatible)"}
    # Defined up front so the final return is valid even when the loop body
    # never runs (max_retries <= 0).
    last_err = "no download attempted (max_retries < 1)"
    for attempt in range(max_retries):
        try:
            # The context manager releases the connection even when
            # raise_for_status() fails.
            with requests.get(url, stream=True, timeout=timeout, headers=headers) as resp:
                resp.raise_for_status()
                payload = resp.content
            # Decoding doubles as a sanity check that the payload is an image.
            img = Image.open(BytesIO(payload)).convert("RGB")
            img.save(dest_path, format="JPEG", quality=85)
            return {"ok": True, "size": dest_path.stat().st_size}
        except Exception as e:  # best-effort: any failure triggers a retry
            last_err = str(e)
            # Back off before the next attempt, but do not waste time
            # sleeping after the final one.
            if attempt < max_retries - 1:
                time.sleep(1 + attempt * 1.5)
    return {"ok": False, "error": last_err}

In [None]:
# ---------- Main cleaning function ----------
def clean_dataset(train, test, download_images=False, max_images=None):
    """Clean the train/test frames, build the image manifest and save all three.

    NOTE: a function with this same name is defined again further down in
    this file; once that later definition has executed it replaces this one.

    Steps: inspect both frames, drop duplicated ``sample_id`` rows, add
    ``catalog_content_clean`` and ``item_pack_qty``, fill missing
    ``image_link`` values and flag ``has_image_url``, drop train rows without
    a price, add ``price_clipped``/``price_log``, add ``catalog_len`` and
    ``catalog_short``, then write the image manifest and both cleaned frames
    to ``IMAGE_MANIFEST``, ``TRAIN_OUT`` and ``TEST_OUT``.

    When no rows are dropped, the new columns are added to the frames passed
    in by the caller (they are not copied first).

    Args:
        train: Training DataFrame; must contain ``sample_id``,
            ``catalog_content``, ``image_link`` and ``price``.
        test: Test DataFrame; same columns except ``price``.
        download_images: When True, each image is fetched via
            ``download_image``; otherwise only the intended path is recorded.
        max_images: Limit the manifest to the first ``max_images`` rows of
            each frame; ``None`` means no limit.

    Returns:
        ``(train, test, manifest)`` — the cleaned frames and the manifest
        DataFrame.
    """
    # Imported locally, as the other helpers in this file do, so the function
    # does not depend on module-level `np` / `pd` aliases being present.
    import numpy as np
    import pandas as pd

    # 1. basic inspection
    quick_inspect(train, "train (raw)")
    quick_inspect(test, "test (raw)")

    # 2. sample_id uniqueness (.copy() so later column assignments do not
    #    write into a slice of the caller's frame)
    if train['sample_id'].duplicated().any():
        print("Warning: duplicated sample_id in train - keeping first occurrence.")
        train = train.drop_duplicates(subset=['sample_id'], keep='first').copy()
    if test['sample_id'].duplicated().any():
        test = test.drop_duplicates(subset=['sample_id'], keep='first').copy()

    # 3. clean text
    print("Cleaning catalog_content ...")
    train['catalog_content_clean'] = train['catalog_content'].apply(clean_text)
    test['catalog_content_clean'] = test['catalog_content'].apply(clean_text)

    # 4. extract item pack qty
    print("Extracting item_pack_qty ...")
    train['item_pack_qty'] = train['catalog_content_clean'].apply(extract_ipq)
    test['item_pack_qty'] = test['catalog_content_clean'].apply(extract_ipq)

    # 5. handle missing images
    for df in (train, test):
        df['image_link'] = df['image_link'].fillna("")
        df['has_image_url'] = df['image_link'].apply(lambda x: bool(str(x).strip()))

    # 6. handle missing price in train (drop and log)
    if train['price'].isnull().any():
        nnull = train['price'].isnull().sum()
        print(f"Found {nnull} null prices in train — dropping those rows.")
        train = train[~train['price'].isnull()].copy()

    # 7. outlier clipping + log transform option
    print("Clipping extreme prices and creating price_log ...")
    train, _low, _high = clip_prices(train)
    train['price_log'] = np.log1p(train['price_clipped'])

    # 8. short catalog / empty handling - keep flag
    for df in (train, test):
        df['catalog_len'] = df['catalog_content_clean'].apply(lambda x: len(str(x).split()))
        df['catalog_short'] = df['catalog_len'] < 3  # flag very short descriptions

    # 9. image manifest + optional download
    manifest_rows = []

    def manifest_row(row):
        """Build the manifest entry for one row, downloading if requested."""
        url = row['image_link']
        sid = row['sample_id']
        dest = IMG_DIR / f"{sid}.jpg"
        if str(url).strip() == "":
            return {"sample_id": sid, "image_link": url, "has_image": False, "path": "", "ok": False, "error": "no-url"}
        if download_images:
            r = download_image(url, dest)
            return {"sample_id": sid, "image_link": url, "has_image": r['ok'], "path": str(dest) if r['ok'] else "", "ok": r['ok'], "error": r.get('error', '')}
        # No download: record the intended path; "ok" stays None (unknown).
        return {"sample_id": sid, "image_link": url, "has_image": True, "path": str(dest), "ok": None, "error": ""}

    print("Creating image manifest (no download unless download_images=True)...")
    # Train rows first, then test rows. Compare against None so that
    # max_images=0 yields an empty manifest instead of meaning "no limit".
    if max_images is None:
        frames = [train, test]
    else:
        frames = [train.head(max_images), test.head(max_images)]
    for df in frames:
        for _, row in tqdm(df.iterrows(), total=len(df)):
            manifest_rows.append(manifest_row(row))
    manifest = pd.DataFrame(manifest_rows)
    manifest.to_csv(IMAGE_MANIFEST, index=False)

    # 10. save cleaned datasets
    train.to_csv(TRAIN_OUT, index=False)
    test.to_csv(TEST_OUT, index=False)
    print("Saved cleaned files:", TRAIN_OUT, TEST_OUT)
    print("Saved image manifest:", IMAGE_MANIFEST)
    return train, test, manifest


In [None]:
def clip_prices(df, lower_quantile=0.001, upper_quantile=0.999):
    """
    Add a ``price_clipped`` column holding ``price`` clipped to quantile bounds.

    The bounds are the ``lower_quantile`` and ``upper_quantile`` quantiles of
    ``df['price']``; both are printed. The column is written onto ``df``
    itself (no copy is made). If ``df`` has no ``price`` column, a warning is
    printed and nothing is changed.

    Returns ``(df, low, high)``; ``low`` and ``high`` are ``None`` when the
    ``price`` column is missing.
    """
    if 'price' in df.columns:
        prices = df['price']
        low = prices.quantile(lower_quantile)
        high = prices.quantile(upper_quantile)
        print(f"Clipping prices: lower bound = {low:.2f}, upper bound = {high:.2f}")
        df['price_clipped'] = prices.clip(lower=low, upper=high)
        return df, low, high

    print("Warning: 'price' column not found for clipping.")
    return df, None, None

In [None]:
# ---------- Main cleaning function ----------
def _manifest_entry(sample_id, url, download_images):
    """Build one image-manifest row for a sample, downloading if requested."""
    dest = IMG_DIR / f"{sample_id}.jpg"
    # str() keeps this check consistent with how has_image_url is derived and
    # tolerates non-string link values.
    if not str(url).strip():
        return {"sample_id": sample_id, "image_link": url, "has_image": False,
                "path": "", "ok": False, "error": "no-url"}
    if not download_images:
        # Nothing fetched: record the intended path; "ok" stays None (unknown).
        return {"sample_id": sample_id, "image_link": url, "has_image": True,
                "path": str(dest), "ok": None, "error": ""}
    result = download_image(url, dest)
    ok = result['ok']
    return {"sample_id": sample_id, "image_link": url, "has_image": ok,
            "path": str(dest) if ok else "", "ok": ok,
            "error": result.get('error', '')}


def _build_image_manifest(frames, download_images):
    """Collect manifest rows for every sample of each frame, in frame order."""
    rows = []
    for df in frames:
        # Only two columns are needed, so iterate them directly instead of
        # materialising a Series per row with iterrows().
        pairs = zip(df['sample_id'], df['image_link'])
        for sample_id, url in tqdm(pairs, total=len(df)):
            rows.append(_manifest_entry(sample_id, url, download_images))
    return rows


def clean_dataset(train, test, download_images=False, max_images=None):
    """Clean the train/test frames, build the image manifest and save all three.

    Steps: inspect both frames, drop duplicated ``sample_id`` rows, add
    ``catalog_content_clean`` and ``item_pack_qty``, fill missing
    ``image_link`` values and flag ``has_image_url``, drop train rows without
    a price, add ``price_clipped``/``price_log``, add ``catalog_len`` and
    ``catalog_short``, then write the image manifest and both cleaned frames
    to ``IMAGE_MANIFEST``, ``TRAIN_OUT`` and ``TEST_OUT``.

    When no rows are dropped, the new columns are added to the frames passed
    in by the caller (they are not copied first).

    Args:
        train: Training DataFrame; must contain ``sample_id``,
            ``catalog_content``, ``image_link`` and ``price``.
        test: Test DataFrame; same columns except ``price``.
        download_images: When True, each image is fetched via
            ``download_image``; otherwise only the intended path is recorded.
        max_images: Limit the manifest to the first ``max_images`` rows of
            each frame; ``None`` means no limit.

    Returns:
        ``(train, test, manifest)`` — the cleaned frames and the manifest
        DataFrame.
    """
    # Imported locally, as the other helpers in this file do, so the function
    # does not depend on module-level aliases being present.
    import os
    import numpy as np
    import pandas as pd

    # 1. basic inspection
    quick_inspect(train, "train (raw)")
    quick_inspect(test, "test (raw)")

    # 2. sample_id uniqueness
    if train['sample_id'].duplicated().any():
        print("Warning: duplicated sample_id in train - keeping first occurrence.")
        train = train.drop_duplicates(subset=['sample_id'], keep='first').copy()
    if test['sample_id'].duplicated().any():
        test = test.drop_duplicates(subset=['sample_id'], keep='first').copy()

    # 3. clean text
    print("Cleaning catalog_content ...")
    train['catalog_content_clean'] = train['catalog_content'].apply(clean_text)
    test['catalog_content_clean'] = test['catalog_content'].apply(clean_text)

    # 4. extract item pack qty
    print("Extracting item_pack_qty ...")
    train['item_pack_qty'] = train['catalog_content_clean'].apply(extract_ipq)
    test['item_pack_qty'] = test['catalog_content_clean'].apply(extract_ipq)

    # 5. handle missing images
    for df in (train, test):
        df['image_link'] = df['image_link'].fillna("")
        df['has_image_url'] = df['image_link'].apply(lambda x: bool(str(x).strip()))

    # 6. handle missing price in train (drop and log)
    price_missing = train['price'].isnull()
    if price_missing.any():
        nnull = price_missing.sum()
        print(f"Found {nnull} null prices in train — dropping those rows.")
        train = train[~price_missing].copy()

    # 7. outlier clipping + log transform option
    print("Clipping extreme prices and creating price_log ...")
    train, _low, _high = clip_prices(train)
    train['price_log'] = np.log1p(train['price_clipped'])

    # 8. short catalog / empty handling - keep flag
    for df in (train, test):
        df['catalog_len'] = df['catalog_content_clean'].apply(lambda x: len(str(x).split()))
        df['catalog_short'] = df['catalog_len'] < 3  # flag very short descriptions

    # 9. image manifest + optional download
    print("Creating image manifest (no download unless download_images=True)...")
    if download_images:
        # Saving an image fails if the target directory is missing.
        os.makedirs(IMG_DIR, exist_ok=True)
    # Train rows first, then test rows. Compare against None so that
    # max_images=0 yields an empty manifest instead of meaning "no limit".
    if max_images is None:
        frames = [train, test]
    else:
        frames = [train.head(max_images), test.head(max_images)]
    # Explicit columns keep the CSV header stable even when there are no rows.
    manifest = pd.DataFrame(
        _build_image_manifest(frames, download_images),
        columns=["sample_id", "image_link", "has_image", "path", "ok", "error"],
    )
    manifest.to_csv(IMAGE_MANIFEST, index=False)

    # 10. save cleaned datasets
    train.to_csv(TRAIN_OUT, index=False)
    test.to_csv(TEST_OUT, index=False)
    print("Saved cleaned files:", TRAIN_OUT, TEST_OUT)
    print("Saved image manifest:", IMAGE_MANIFEST)
    return train, test, manifest

In [None]:
# ---------- Run cleaning ----------
# Entry point: load the raw CSVs and run the cleaning pipeline once.
if __name__ == "__main__":
    train, test = load_data()
    # The initial run does NOT download images (download_images=False) and puts
    # no cap on the manifest size (max_images=None). Call clean_dataset again
    # with download_images=True in a later cell to fetch the images.
    train_cleaned, test_cleaned, manifest = clean_dataset(train, test, download_images=False, max_images=None)
    print("Done Step -1 cleaning.")