In [1]:
# Reference notes only: dataset splits and statistics as reported in the CoLLM paper.
# NOTE(review): this is a bare string literal, not a comment, so the notebook echoes
# it as the cell's output. A markdown cell would be the more conventional home for it.
"""
from CoLLM paper

ML-1M
preserve the interactions from the most recent twenty months, using the first 10 months for training, the middle 5 months for validation, and the last 5 months for testing
Train: 33,891
Valid: 10,401
Test: 7,331
User: 839
Item: 3,256


Amazon-Book dataset
preserve interactions from the year 2017 (including about 4 million interactions)
allocating the first 11 months for training, and the remaining two half months for validation and testing, respectively

filtered out users and items with fewer than 20 interactions to ensure data quality for measuring warm-start performance

Train: 727,468
Valid: 25,747
Test: 25,747
User: 22,967
Item: 34,154
"""

'\nfrom CoLLM paper\n\nML-1M\npreserve the interactions from the most recent twenty months, using the first 10 months for training, the middle 5 months for validation, and the last 5 months for testing\nTrain: 33,891\nValid: 10,401\nTest: 7,331\nUser: 839\nItem: 3,256\n\n\nAmazon-Book dataset\npreserve interactions from the year 2017 (including about 4 million interactions)\nallocating the first 11 months for training, and the remaining two half months for validation and testing, respectively\n\nfiltered out users and items with fewer than 20 interactions to ensure data quality for measuring warm-start performance\n\nTrain: 727,468\nValid: 25,747\nTest: 25,747\nUser: 22,967\nItem: 34,154\n'

In [2]:
import os
from pathlib import Path
import requests

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
# from torch_geometric.data import Dataset, download_url

from datasets import load_dataset

from src.data.utils import loadFileFromURL
from src.utils.wrapper import timeMeasured
            

class AmazonDataset(Dataset):
    """
    Dataset class for the Amazon Review Dataset from 2023.
    Overview: https://amazon-reviews-2023.github.io/main.html

    Constructing an instance downloads the interaction splits and the item
    metadata of one category into ``<root>/raw``. Files that already exist on
    disk are not fetched again.
    """

    def __init__(self, root, category="Books", interactionDataUrl=None):
        """
        Parameters:
            root (str): root dir of dataset
            category (str): category as e.g. 'Books'
            interactionDataUrl (str): base url of the interaction data, default is: 5core and timestamp_w_his
        """
        self.root = root
        self.category = category
        self.interactionDataUrl = interactionDataUrl or "https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/benchmark/5core/timestamp_w_his"
        self.rawDataDir = Path(self.root) / "raw"
        self.rawDataDir.mkdir(parents=True, exist_ok=True)

        # Both downloads run eagerly, so construction may hit the network.
        self.downloadInteractionData()
        self.downloadItemData()

    @timeMeasured
    def downloadInteractionData(self):
        """
        Downloads the train/valid/test interaction files of ``self.category``
        into ``<root>/raw/Interactions``, skipping files that already exist.
        """
        interactionsDir = self.rawDataDir / "Interactions"
        interactionsDir.mkdir(parents=True, exist_ok=True)
        # Accept base URLs with or without a trailing slash; joining with a
        # fixed "/" would otherwise produce ".../timestamp_w_his//<file>".
        baseUrl = self.interactionDataUrl.rstrip("/")

        for split in ["train", "valid", "test"]:
            datasetFilename = f"{self.category}.{split}.csv.gz"
            datasetPath = interactionsDir / datasetFilename

            if not datasetPath.exists():
                datasetUrl = f"{baseUrl}/{datasetFilename}"
                loadFileFromURL(datasetUrl, datasetPath)

    @timeMeasured
    def downloadItemData(self):
        """
        Loads the item metadata of ``self.category`` via ``load_dataset`` and
        stores it as a gzip-compressed CSV in ``<root>/raw/Items``.

        Any exception is reported with ``print`` instead of being raised
        (best-effort download). The CSV is first written to a ``.part`` file
        and only moved to its final name once complete, so an interrupted
        write can never be mistaken for a finished download on the next run.
        """
        datasetFilename = f"{self.category}ItemMetadata.csv.gz"
        datasetPath = self.rawDataDir / "Items" / datasetFilename
        datasetPath.parent.mkdir(parents=True, exist_ok=True)

        if datasetPath.exists():
            return

        partialPath = datasetPath.with_name(datasetPath.name + ".part")
        try:
            itemInformation = load_dataset(
                "McAuley-Lab/Amazon-Reviews-2023",
                f"raw_meta_{self.category}",
                split="full",
                trust_remote_code=True
            )
            dataframe = pd.DataFrame.from_records(itemInformation)
            # compression is given explicitly, so the ".part" suffix does not
            # affect the file format.
            dataframe.to_csv(partialPath, index=False, compression="gzip")
            partialPath.replace(datasetPath)
            print(f"File downloaded and saved to {datasetPath}")
        except Exception as e:
            print(f"Failed to download file. The following exception occured: {e}")
        finally:
            # After a successful replace() the partial file no longer exists;
            # this only removes leftovers of a failed write.
            if partialPath.exists():
                partialPath.unlink()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Disabled example: the code below is wrapped in a string literal, so it is not
# executed (the notebook only echoes the string as output). It uses the
# AmazonDataset(root, category, interactionDataUrl) signature.
# NOTE(review): a cell that only holds disabled code is a candidate for removal.
'''root="data/Test"
os.makedirs(root, exist_ok=True)
category = "Books"
interactionDataUrl = "https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/benchmark/5core/timestamp_w_his"

AmazonDataset(root, category, interactionDataUrl)'''

'root="data/Test"\nos.makedirs(root, exist_ok=True)\ncategory = "Books"\ninteractionDataUrl = "https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/benchmark/5core/timestamp_w_his"\n\nAmazonDataset(root, category, interactionDataUrl)'

In [None]:
import os
from pathlib import Path
from urllib.parse import urljoin
import requests
import pprint
import gzip
import json

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
# from torch_geometric.data import Dataset, download_url

from datasets import load_dataset

from src.data.utils import loadFileFromURL
from src.utils.wrapper import tryExcept, timeMeasured


class AmazonDataset(Dataset):
    """
    Dataset class for the Amazon Review Dataset from 2023.
    Overview: https://amazon-reviews-2023.github.io/main.html

    Constructing an instance reads the entry ``datasetName`` from a JSON config
    file, downloads the interaction, item-metadata and review files into
    ``<root>/raw`` (files already on disk are not fetched again) and writes the
    accepted item-metadata records as one JSON file per item.
    """

    def __init__(self, root, datasetConfig, datasetName):
        """
        Parameters:
            root (str): root dir of dataset; raw files are stored in ``<root>/raw``
            datasetConfig (str): path to a JSON file that maps dataset names to their config
            datasetName (str): key of the config entry to use

        Raises:
            ValueError: if ``datasetName`` is not a key of the config file, or if
                its entry does not define a ``category``.
        """
        self.root = root
        self.datasetName = datasetName

        with open(datasetConfig, "r", encoding="utf-8") as configFile:
            configData = json.load(configFile)
        if datasetName not in configData:
            # Continuing with an empty config would silently build file names
            # such as "None.train.csv.gz".
            raise ValueError(
                f"Dataset '{datasetName}' not found in config file {datasetConfig}"
            )
        # From here on self.datasetConfig is the parsed entry (a dict), not the path.
        self.datasetConfig = configData[datasetName]
        print("Dataset Config set as:")
        pprint.pp(self.datasetConfig)

        self.category = self.datasetConfig.get("category")
        if not self.category:
            raise ValueError(
                f"Config entry '{datasetName}' does not define a 'category'"
            )
        urls = self.datasetConfig.get("urls", {})
        self.interactionDataUrl = urls.get("interactionDataUrl", "default_interaction_data_url")
        self.metaDataUrl = urls.get("metaDataUrl", "default_meta_data_url")
        self.reviewDataUrl = urls.get("reviewDataUrl", "default_review_data_url")

        self.rawDataDir = Path(self.root) / "raw"
        self.rawDataDir.mkdir(parents=True, exist_ok=True)

        # All steps run eagerly, so construction may hit the network.
        self.downloadInteractionData()
        self.downloadItemDataAsJSON()
        self.unwrapItemData(self.rawMetaDatasetPath)

    @staticmethod
    def _fileUrl(baseUrl, filename):
        """
        Joins a directory URL and a file name.

        ``urljoin`` replaces the last path segment of a base URL that does not
        end in "/", so the slash is added first when it is missing.
        """
        if not baseUrl.endswith("/"):
            baseUrl += "/"
        return urljoin(baseUrl, filename)

    @tryExcept
    @timeMeasured
    def downloadInteractionData(self):
        """
        Downloads the train/valid/test interaction files of ``self.category``
        into ``<root>/raw/Interactions``, skipping files that already exist.
        """
        interactionsDir = self.rawDataDir / "Interactions"
        interactionsDir.mkdir(parents=True, exist_ok=True)
        for split in ["train", "valid", "test"]:
            datasetFilename = f"{self.category}.{split}.csv.gz"
            datasetPath = interactionsDir / datasetFilename
            if not datasetPath.exists():
                datasetUrl = self._fileUrl(self.interactionDataUrl, datasetFilename)
                loadFileFromURL(datasetUrl, datasetPath)

    @tryExcept
    @timeMeasured
    def downloadItemDataAsJSON(self):
        """
        Downloads the raw item-metadata and review ``.jsonl.gz`` files of
        ``self.category`` into ``<root>/raw/Items``, skipping existing files.

        Sets ``self.rawMetaDatasetPath`` and ``self.rawReviewDatasetPath`` to
        the local target paths before the respective download is attempted.
        """
        itemsDir = self.rawDataDir / "Items"
        itemsDir.mkdir(parents=True, exist_ok=True)

        metaDataFilename = f"meta_{self.category}.jsonl.gz"
        self.rawMetaDatasetPath = itemsDir / metaDataFilename
        if not self.rawMetaDatasetPath.exists():
            loadFileFromURL(self._fileUrl(self.metaDataUrl, metaDataFilename), self.rawMetaDatasetPath)

        reviewDataFilename = f"{self.category}.jsonl.gz"
        self.rawReviewDatasetPath = itemsDir / reviewDataFilename
        if not self.rawReviewDatasetPath.exists():
            loadFileFromURL(self._fileUrl(self.reviewDataUrl, reviewDataFilename), self.rawReviewDatasetPath)

    @tryExcept
    @timeMeasured
    def downloadItemDataFromHF(self):
        """
        Loads the item metadata of ``self.category`` via ``load_dataset`` and
        stores it as a gzip-compressed CSV in ``<root>/raw/Items``.

        The CSV is first written to a ``.part`` file and only moved to its
        final name once complete, so an interrupted write is never mistaken
        for a finished download on the next run.
        """
        datasetFilename = f"{self.category}ItemMetadata.csv.gz"
        datasetPath = self.rawDataDir / "Items" / datasetFilename
        datasetPath.parent.mkdir(parents=True, exist_ok=True)
        if not datasetPath.exists():
            itemInformation = load_dataset(
                "McAuley-Lab/Amazon-Reviews-2023",
                f"raw_meta_{self.category}",
                split="full",
                trust_remote_code=True
            )
            dataframe = pd.DataFrame.from_records(itemInformation)
            partialPath = datasetPath.with_name(datasetPath.name + ".part")
            try:
                # compression is given explicitly, so the ".part" suffix does
                # not affect the file format.
                dataframe.to_csv(partialPath, index=False, compression="gzip")
                partialPath.replace(datasetPath)
            finally:
                # Only leftovers of a failed write still exist at this point.
                if partialPath.exists():
                    partialPath.unlink()
            print(f"File downloaded and saved to {datasetPath}")

    def checkRequiredFields(self, jsonLine):
        """
        Checks whether the required fields are present based on the config.
        Returns True if all required fields are valid; False otherwise.

        A field is required when its entry in ``required_fields`` is truthy.
        ``description`` and ``images`` must be non-empty lists. Any other
        required field must be non-null and, if it is a string or container,
        non-empty. Values such as ``0``, ``0.0`` or ``False`` count as present.
        """
        requiredFields = self.datasetConfig.get("required_fields", {})
        for field, isRequired in requiredFields.items():
            if not isRequired:
                continue
            value = jsonLine.get(field)
            # For description and images, we ensure they are non-empty lists
            if field in ("description", "images"):
                if not isinstance(value, list) or not value:
                    return False
            # For other fields, reject only null and empty values; a plain
            # truthiness test would also reject a legitimate 0 or False.
            elif value is None:
                return False
            elif isinstance(value, (str, bytes, list, tuple, dict, set)) and not value:
                return False
        return True

    @tryExcept
    @timeMeasured
    def unwrapItemData(self, datasetPath, maxLines=100):
        """
        Splits a gzip-compressed JSON-lines metadata file into one JSON file
        per accepted item under ``<root>/raw/Items/Unwrapped/<datasetName>``.

        A record is written when ``checkRequiredFields`` accepts it and it has
        a ``parent_asin``, which becomes the output file name. The ids of all
        written records are stored in ``self.dumpedJSONlist``.

        Parameters:
            datasetPath (str | Path): path of the ``.jsonl.gz`` file to read
            maxLines (int | None): maximum number of input lines to read;
                ``None`` reads the whole file. Defaults to 100, the limit that
                was previously hard-coded.
                NOTE(review): passing this by keyword relies on the decorators
                forwarding keyword arguments; confirm in src.utils.wrapper.
        """
        rawUnwrappedDataDir = self.rawDataDir / "Items" / "Unwrapped" / self.datasetName
        rawUnwrappedDataDir.mkdir(parents=True, exist_ok=True)

        linesCount, self.dumpedJSONlist = 0, []
        with gzip.open(datasetPath, "rt", encoding="utf-8") as f:
            for line in f:
                if maxLines is not None and linesCount >= maxLines:
                    break
                linesCount += 1
                line = line.strip()
                if not line:
                    # Blank lines carry no record; json.loads("") would raise
                    # and abort the whole run.
                    continue
                jsonLine = json.loads(line)
                if not self.checkRequiredFields(jsonLine):  # Only save if required fields are valid
                    continue
                parent_asin = jsonLine.get("parent_asin")
                if not parent_asin:
                    # Without an id there is no sensible file name (it would
                    # end up as "None.json").
                    continue
                # NOTE(review): parent_asin comes from the downloaded file and is
                # used verbatim as a file name; assumed to contain no path separators.
                outputFilePath = rawUnwrappedDataDir / f"{parent_asin}.json"
                with open(outputFilePath, "w", encoding="utf-8") as outputFile:
                    json.dump(jsonLine, outputFile, indent=4)
                self.dumpedJSONlist.append(parent_asin)

        print(f"Unwrapped dataset from {datasetPath}")
        print(f"Saved {len(self.dumpedJSONlist)} from a total of {linesCount} lines.")


In [None]:
# Demo run: build the dataset described by the "AmazonAllBeautyDataset" entry
# of the JSON config. All paths are relative to the kernel's working directory.
datasetName = "AmazonAllBeautyDataset"
datasetConfig = "src/data/datasetConfigAmazon.json"
root = "data/Test"

# Make sure the dataset root exists before the dataset object is created.
Path(root).mkdir(parents=True, exist_ok=True)

AmazonBeautyDataset = AmazonDataset(
    root=root,
    datasetConfig=datasetConfig,
    datasetName=datasetName,
)

Dataset Config set as:
{'category': 'All_Beauty',
 'urls': {'interactionDataUrl': 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/benchmark/5core/timestamp_w_his/',
          'metaDataUrl': 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/',
          'reviewDataUrl': 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/'},
 'required_fields': {'parent_asin': True,
                     'description': True,
                     'images': True,
                     'title': True,
                     'average_rating': False,
                     'features': False,
                     'price': False,
                     'details': False}}
downloadInteractionData executed in 0 hours, 0 minutes, 0 seconds.

downloadItemDataAsJSON executed in 0 hours, 0 minutes, 0 seconds.

Unwrapped dataset from data/Test/raw/Items/meta_All_Beauty.jsonl.gz
Saved 29 from a total of 100 lines.
unwrapItemData executed in 0 hours, 0