In [None]:
from huggingface_hub import login
import os 
from dotenv import load_dotenv #importing env file


In [None]:
# Load variables from a local .env file (if any) into the process environment,
# then authenticate against the Hugging Face Hub with HF_TOKEN.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    # Fail early with an actionable message rather than a bare KeyError('HF_TOKEN')
    # (or an empty token being handed to login()).
    raise KeyError(
        "HF_TOKEN is not set; add it to your .env file or export it "
        "before running this notebook"
    )
login(hf_token, add_to_git_credential=True)



In [None]:
# Put ../src on the import path so the `items` module can be imported.
import sys
import os

# Add the sibling ``src`` directory to the Python path.
# The path is normalised and appended only when missing, so re-running this
# cell does not keep growing sys.path with duplicate entries.
SRC_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from items import Item

In [None]:
from datetime import datetime
from tqdm import tqdm
from datasets import load_dataset


# We'll only process items within this price range.
# Both bounds are inclusive: a record is kept for further validation only when
# MIN_PRICE <= price <= MAX_PRICE.
MIN_PRICE = 0.50
MAX_PRICE = 999.49

def process_products_for_category(category_name):
    """
    Load the raw product metadata for one category and keep the usable items.

    The ``raw_meta_<category_name>`` configuration of
    ``McAuley-Lab/Amazon-Reviews-2023`` is loaded, each record's ``price`` is
    parsed as a float, and records priced within ``MIN_PRICE``..``MAX_PRICE``
    (inclusive) are wrapped in ``Item``. Only items whose ``include`` attribute
    is truthy are kept; each kept item has ``category`` set to
    ``category_name``.

    Args:
        category_name: Category suffix of the dataset configuration,
            e.g. ``"Appliances"``.

    Returns:
        list: The accepted ``Item`` objects. An empty list is returned when
        the dataset cannot be loaded; the failure is reported on stdout
        instead of being raised so that a run over several categories can
        continue with the remaining ones.
    """
    start_time = datetime.now()

    # Load the dataset from the hub.
    try:
        dataset = load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            f"raw_meta_{category_name}",
            split="full",
            trust_remote_code=True
        )
    except Exception as e:
        # Best-effort: keep going, but never hide *why* a category came back
        # empty (a misspelled category name would otherwise look like
        # "no products").
        print(f"Could not load dataset for '{category_name}': {e!r}")
        return []

    clean_items = []
    # Loop through and validate each item.
    for data_point in tqdm(dataset, desc="   - Filtering"):
        try:
            price_str = data_point.get('price')
            if not price_str:
                # Missing or empty price - nothing to validate.
                continue

            price = float(price_str)

            if MIN_PRICE <= price <= MAX_PRICE:
                item = Item(data_point, price)
                if item.include:
                    item.category = category_name
                    clean_items.append(item)

        except (ValueError, TypeError):
            # Price not a valid number - skip it.
            # NOTE(review): this also swallows ValueError/TypeError raised
            # while building the Item; kept as-is to preserve behavior.
            continue

    end_time = datetime.now()
    duration_minutes = (end_time - start_time).total_seconds() / 60

    print(f"{len(clean_items):,} items for '{category_name}'.")
    print(f"time: {duration_minutes:.1f} minutes.")

    return clean_items





In [None]:
# Single-category run of the pipeline. Note that `items` is reset to an empty
# list and rebuilt by the multi-category loop further down, so this result is
# replaced there.
items = process_products_for_category('Appliances')


In [None]:
# Categories to process. Each name is used as the suffix of a
# "raw_meta_<name>" dataset configuration, so it must be the bare category
# name with no file extension.
dataset_names = [
    # Currently disabled categories:
    # "Automotive",
    # "Electronics",
    # "Office_Products",
    # "Tools_and_Home_Improvement",
    # "Cell_Phones_and_Accessories",
    "Toys_and_Games",
    "Appliances",
    "Musical_Instruments",
    "Health_and_Personal_Care",
    "Video_Games",
    "Grocery_and_Gourmet_Food",
    "All_Beauty",
]

In [None]:
# Gather the filtered items of every configured category into one flat list.
items = []
for dataset_name in dataset_names:
    items += process_products_for_category(dataset_name)

In [None]:
# Report how many items are currently held in `items`.
print(f"Total No of items is :{len(items)}")