# Install Dependencies

In [1]:
!pip -q install datasets
!pip -q install unidecode

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 whic

In [2]:
import datasets
from unidecode import unidecode
import csv
import datetime

# Restrict the `datasets` library's logging to ERROR level so that its
# informational and warning messages do not clutter the cell outputs.
datasets.logging.set_verbosity_error()

# Simple POC

In [4]:
# Proof of concept: stream the Electronics metadata config and print a single
# record so its fields can be inspected.
dataset = datasets.load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Electronics",  # plain literal: no placeholders, so no f-string needed
    split="full",
    trust_remote_code=True,
    streaming=True,
)

# Optional: uncomment to sample from a shuffled buffer instead of the stream head.
# dataset = dataset.shuffle(seed=42, buffer_size=1_000)
dataset = dataset.take(10)

# Only the first record is needed; stop immediately after printing it.
for item in dataset:
    print(item)
    break

{'main_category': 'All Electronics', 'title': 'FS-1051 FATSHARK TELEPORTER V3 HEADSET', 'average_rating': 3.5, 'rating_number': 6, 'features': [], 'description': ['Teleporter V3 The “Teleporter V3” kit sets a new level of value in the FPV world with Fat Shark renowned performance and quality. The fun of FPV is experienced firsthand through the large screen FPV headset with integrated NexwaveRF receiver technology while simultaneously recording onboard HD footage with the included “PilotHD” camera. The “Teleporter V3” kit comes complete with everything you need to step into the cockpit of your FPV vehicle. We’ve included our powerful 250mW 5.8Ghz transmitter, 25 degree FOV headset (largest QVGA display available), the brand new “PilotHD” camera with live AV out and all the cables, antennas and connectors needed.'], 'price': 'None', 'images': {'hi_res': [None], 'large': ['https://m.media-amazon.com/images/I/41qrX56lsYL._AC_.jpg'], 'thumb': ['https://m.media-amazon.com/images/I/41qrX56lsY

In [15]:
from random import randrange

# Scratch check: draw one pseudo-random integer from range(4000) and show it.
random_draw = randrange(4000)
print(random_draw)

1814


# Write `insert.sql` - don't use this

In [None]:
# Upper bound on the number of records streamed per category; records that fail
# validation are skipped, so fewer rows may actually be written.
NUM_RECORDS = 10_000

# Constants
SQL_INSERT = "INSERT INTO public.\"Listings\" (seller_id, title, price, location, postal_code, status, category) VALUES\n"
# NOTE(review): WKT points are conventionally written "POINT(x y)", i.e. longitude
# first; this literal has what looks like latitude first - confirm against the schema.
LOCATION = "'POINT(48.378400 -123.415600)'::GEOMETRY"
POSTAL_CODE = "'V8R6N2'"
STATUS = "'AVAILABLE'"
# seller_id cycles through 1..NUM_SELLERS based on the record's position in the stream.
NUM_SELLERS = 20
# Values that mark a field as absent in the source records.
MISSING_VALUES = ('None', None, '')

# from https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/tree/main/raw/meta_categories
categories = [
    'Appliances',
    'Automotive',
    'Beauty_and_Personal_Care',
    'Cell_Phones_and_Accessories',
    'Clothing_Shoes_and_Jewelry',
    'Electronics',
    'Health_and_Household',
    'Home_and_Kitchen',
    'Industrial_and_Scientific',
    'Musical_Instruments',
    'Office_Products',
    'Patio_Lawn_and_Garden',
    'Sports_and_Outdoors',
    'Tools_and_Home_Improvement',
    'Video_Games'
]


def _sql_quote(text):
    """Return `text` as a single-quoted SQL string literal with embedded quotes doubled.

    NOTE(review): assumes the target database treats backslashes literally inside
    ordinary string literals - confirm for the server this script is loaded into.
    """
    return "'" + str(text).replace("'", "''") + "'"


def _listing_values(index, item):
    """Build the `(…)` VALUES tuple for one record, or return None if it must be skipped.

    A record is skipped when its title, price or main_category is missing, or when
    its price cannot be parsed as a finite number (the CSV export applies the same
    numeric check).
    """
    title = item['title']
    price = item['price']
    main_category = item['main_category']

    if any(field in MISSING_VALUES for field in (title, price, main_category)):
        return None

    try:
        price_value = float(price)
    except (TypeError, ValueError):
        return None
    # float() also accepts "nan"/"inf", which are not valid bare SQL numeric literals.
    if price_value != price_value or price_value in (float('inf'), float('-inf')):
        return None

    seller_id = (index % NUM_SELLERS) + 1
    return (
        f"({seller_id}, {_sql_quote(unidecode(title))}, {price_value}, "
        f"{LOCATION}, {POSTAL_CODE}, {STATUS}, {_sql_quote(main_category)})"
    )


# Create new file - this will overwrite existing files. The context manager
# guarantees the file is closed even if streaming fails part-way through.
with open('./insert.sql', 'w', encoding='utf-8') as sql_file:
    for category in categories:

        # Stream the category's metadata and keep only the first NUM_RECORDS records.
        # dataset = dataset.shuffle(seed=42, buffer_size=1_000)
        dataset = datasets.load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            f"raw_meta_{category}",
            split="full",
            trust_remote_code=True,
            streaming=True
        ).take(NUM_RECORDS)

        print(f"Writing [{category}]...\n")

        # Collect the valid rows first so separators are placed between rows that
        # are actually written (the stream's first record may be filtered out).
        rows = [
            values
            for values in (_listing_values(i, item) for i, item in enumerate(dataset))
            if values is not None
        ]

        # An INSERT with an empty VALUES list is invalid SQL; skip such categories.
        if not rows:
            continue

        # One INSERT statement per category, followed by blank lines as a separator.
        sql_file.write(SQL_INSERT + ',\n'.join(rows) + ';\n\n\n')

# Write `listings.csv` - use this instead

In [20]:
from random import Random

# Upper bound on the number of records streamed per category; records that fail
# validation are skipped, so fewer rows may actually be written.
NUM_RECORDS = 20_000

# Constants
FIELDS = ['seller_id', 'title', 'price', 'latitude', 'longitude', 'postal_code', 'status', 'listed_at', 'last_updated_at', 'category']
LATITUDE = 48.378400
LONGITUDE = -123.415600
POSTAL_CODE = 'V8R6N2'
STATUS = 'AVAILABLE'
TIMESTAMP = datetime.datetime(2024, 7, 1)
# seller_id is drawn uniformly from 1..NUM_SELLERS (inclusive).
NUM_SELLERS = 4_020
# Fixed seed so that re-running the cell reproduces the same seller assignment.
SEED = 42
# Values that mark a field as absent in the source records.
MISSING_VALUES = ('None', None, '')

# from https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/tree/main/raw/meta_categories
categories = [
    'Appliances',
    'Automotive',
    'Beauty_and_Personal_Care',
    'Cell_Phones_and_Accessories',
    'Clothing_Shoes_and_Jewelry',
    'Electronics',
    'Health_and_Household',
    'Home_and_Kitchen',
    'Industrial_and_Scientific',
    'Musical_Instruments',
    'Office_Products',
    'Patio_Lawn_and_Garden',
    'Sports_and_Outdoors',
    'Tools_and_Home_Improvement',
    'Video_Games'
]

rng = Random(SEED)

# Create csv file - this will overwrite existing files.
# newline='' is what the csv module expects of files it writes to; the context
# manager closes the file even if streaming is interrupted.
with open('./listings.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=FIELDS)
    writer.writeheader()

    for category in categories:

        # Stream the category's metadata and keep only the first NUM_RECORDS records.
        # dataset = dataset.shuffle(seed=42, buffer_size=1_000)
        dataset = datasets.load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            f"raw_meta_{category}",
            split="full",
            trust_remote_code=True,
            streaming=True
        ).take(NUM_RECORDS)

        print(f"Writing [{category}]...\n")
        for item in dataset:
            title = item['title']
            price = item['price']
            main_category = item['main_category']

            # Skip records with a missing title, price or category.
            if any(field in MISSING_VALUES for field in (title, price, main_category)):
                continue

            # Skip records whose price is not numeric. Only the parse is guarded, so
            # unrelated failures (and Ctrl-C / kernel interrupt) are no longer swallowed.
            try:
                float(price)
            except (TypeError, ValueError):
                continue

            writer.writerow({
                'seller_id': rng.randrange(1, NUM_SELLERS + 1),
                'title': unidecode(title),
                'price': price,
                'latitude': LATITUDE,
                'longitude': LONGITUDE,
                'postal_code': POSTAL_CODE,
                'status': STATUS,
                'listed_at': TIMESTAMP,
                'last_updated_at': TIMESTAMP,
                'category': main_category
            })

Writing [Appliances]...

Writing [Automotive]...

Writing [Beauty_and_Personal_Care]...

Writing [Cell_Phones_and_Accessories]...

Writing [Clothing_Shoes_and_Jewelry]...

Writing [Electronics]...

Writing [Health_and_Household]...

Writing [Home_and_Kitchen]...

Writing [Industrial_and_Scientific]...

Writing [Musical_Instruments]...

Writing [Office_Products]...

Writing [Patio_Lawn_and_Garden]...

Writing [Sports_and_Outdoors]...

Writing [Tools_and_Home_Improvement]...

Writing [Video_Games]...



# Download file

In [21]:
from google.colab import files

# Hand the generated CSV to the browser as a download.
# To fetch the SQL script instead, use: files.download('insert.sql')
export_path = 'listings.csv'
files.download(export_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>