In [None]:
import pandas as pd
from typing import List, Dict

In [None]:
USD_TO_IDR = 16000

In [None]:
def _to_dataframe(data: List[Dict]) -> pd.DataFrame:
    """Build a DataFrame from the list of dicts produced by the extract step.

    Args:
        data: Records to load, one dict per row.

    Returns:
        A DataFrame constructed directly from ``data``.

    Raises:
        ValueError: If pandas cannot construct a DataFrame from ``data``.
    """
    try:
        frame = pd.DataFrame(data)
    except Exception as exc:
        # Surface every construction failure under a single exception type.
        raise ValueError(f"Failed to convert data to DataFrame: {exc}")
    return frame

In [None]:
def clean_price(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the ``price`` column from USD text to a numeric IDR amount.

    The first number found in each value is read as the USD price and
    multiplied by ``USD_TO_IDR``. Thousands separators (``"$1,299.99"``) are
    removed first so the whole amount is parsed instead of only the digits
    before the first comma. Values that contain no digits become ``NaN``.

    Args:
        df: Frame containing a ``price`` column. It is modified in place.

    Returns:
        The same frame, with ``price`` replaced by float IDR values.

    Raises:
        ValueError: If the column is missing or the conversion fails.
    """
    try:
        usd = (
            df["price"]
            .astype(str)
            # Drop thousands separators so "1,299.99" is not truncated to "1".
            .str.replace(",", "", regex=False)
            .str.extract(r"(\d+\.?\d*)", expand=False)
            .astype(float)
        )
        df["price"] = usd * USD_TO_IDR
        return df
    except Exception as e:
        raise ValueError(f"Price transformation failed: {e}") from e

In [None]:
def clean_rating(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the ``rating`` column from text to a float score.

    A scale suffix such as ``"/ 5"`` is removed before the first number is
    extracted. Without that step a value like ``"Invalid Rating / 5"`` would
    be read as ``5.0`` instead of being treated as invalid. Values that
    contain no score become ``NaN``.

    Args:
        df: Frame containing a ``rating`` column. It is modified in place.

    Returns:
        The same frame, with ``rating`` replaced by float values.

    Raises:
        ValueError: If the column is missing or the conversion fails.
    """
    try:
        text = df["rating"].astype(str)
        # Strip the "/ N" denominator so it cannot be mistaken for the score.
        text = text.str.replace(r"/\s*\d+(?:\.\d+)?", "", regex=True)
        df["rating"] = (
            text.str.extract(r"(\d+\.?\d*)", expand=False).astype(float)
        )
        return df
    except Exception as e:
        raise ValueError(f"Rating transformation failed: {e}") from e

In [None]:
def clean_colors(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce the ``colors`` column to the first integer found in each value.

    When every row contains a number the result is a plain integer column.
    When some rows contain no digits, those rows become missing (``<NA>``)
    in a nullable ``Int64`` column instead of aborting the whole
    transformation, so they can be filtered out afterwards like other
    invalid rows.

    Args:
        df: Frame containing a ``colors`` column. It is modified in place.

    Returns:
        The same frame, with ``colors`` replaced by integer values.

    Raises:
        ValueError: If the column is missing or the conversion fails.
    """
    try:
        extracted = (
            df["colors"].astype(str).str.extract(r"(\d+)", expand=False)
        )
        if extracted.isna().any():
            # A plain int column cannot hold missing values; use nullable Int64.
            df["colors"] = pd.to_numeric(extracted, errors="coerce").astype(
                "Int64"
            )
        else:
            df["colors"] = extracted.astype(int)
        return df
    except Exception as e:
        raise ValueError(f"Colors transformation failed: {e}") from e

In [None]:
def clean_size(df: pd.DataFrame) -> pd.DataFrame:
    """Strip the ``"Size:"`` label and surrounding whitespace from ``size``.

    Missing values stay missing. Converting the column to ``str`` would
    otherwise turn ``None``/``NaN`` into the literal text ``"None"``/``"nan"``,
    which a later null filter could not detect.

    Args:
        df: Frame containing a ``size`` column. It is modified in place.

    Returns:
        The same frame, with ``size`` cleaned.

    Raises:
        ValueError: If the column is missing or the cleaning fails.
    """
    try:
        raw = df["size"]
        cleaned = (
            raw.astype(str).str.replace("Size:", "", regex=False).str.strip()
        )
        # Restore the rows that were missing before the str conversion.
        df["size"] = cleaned.where(raw.notna())
        return df
    except Exception as e:
        raise ValueError(f"Size transformation failed: {e}") from e

In [None]:
def clean_gender(df: pd.DataFrame) -> pd.DataFrame:
    """Strip the ``"Gender:"`` label and surrounding whitespace from ``gender``.

    Missing values stay missing. Converting the column to ``str`` would
    otherwise turn ``None``/``NaN`` into the literal text ``"None"``/``"nan"``,
    which a later null filter could not detect.

    Args:
        df: Frame containing a ``gender`` column. It is modified in place.

    Returns:
        The same frame, with ``gender`` cleaned.

    Raises:
        ValueError: If the column is missing or the cleaning fails.
    """
    try:
        raw = df["gender"]
        cleaned = (
            raw.astype(str).str.replace("Gender:", "", regex=False).str.strip()
        )
        # Restore the rows that were missing before the str conversion.
        df["gender"] = cleaned.where(raw.notna())
        return df
    except Exception as e:
        raise ValueError(f"Gender transformation failed: {e}") from e

In [None]:
def remove_invalid_and_duplicate(df: pd.DataFrame) -> pd.DataFrame:
    """Filter out unusable rows.

    Rows are removed, in this order, when they:
    - contain any null value,
    - exactly duplicate an earlier row,
    - have the placeholder title ``"Unknown Product"``.

    Args:
        df: Frame containing at least a ``title`` column.

    Returns:
        A filtered frame; the original index labels are kept.

    Raises:
        ValueError: If the filtering fails (e.g. ``title`` is missing).
    """
    try:
        deduped = df.dropna().drop_duplicates()
        is_real_product = deduped["title"].ne("Unknown Product")
        return deduped.loc[is_real_product]
    except Exception as exc:
        raise ValueError(f"Data filtering failed: {exc}")

In [None]:
def transform_products(data: List[Dict]) -> pd.DataFrame:
    """Run the full transformation pipeline on extracted product records.

    Steps:
    - convert the list of dicts to a DataFrame,
    - clean each column (price, rating, colors, size, gender),
    - drop invalid and duplicate rows.

    Args:
        data: Product records, one dict per product.

    Returns:
        The cleaned DataFrame.
    """
    # Order matters: row filtering runs last, after all column cleaners.
    steps = (
        clean_price,
        clean_rating,
        clean_colors,
        clean_size,
        clean_gender,
        remove_invalid_and_duplicate,
    )

    frame = _to_dataframe(data)
    for step in steps:
        frame = step(frame)
    return frame

In [None]:
# Put ../utils (resolved against the current working directory) at the front
# of the module search path so that `import extract` below can be resolved.
# NOTE(review): presumably extract.py lives in that directory — confirm.
import sys
sys.path.insert(0, "../utils")

import extract
import json

# Collect the raw records from the extract module.
# NOTE(review): assumed to return a list of dicts, as transform_products
# expects — confirm against extract.scrape_all_products.
data = extract.scrape_all_products()
# print(json.dumps(data[0], indent=2, ensure_ascii=False))

# Run the records through the cleaning pipeline; the result is kept in `df`.
df = transform_products(data)
