In [221]:
import matplotlib.pyplot as plt
import json
import re

from datasets import load_dataset
from typing import Optional
from transformers import AutoTokenizer

%matplotlib inline

In [3]:
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Appliances", split="full")
data = []
                       
for datapoint in dataset:
    if datapoint['price'] != 'None':
        data.append(datapoint)
        

In [4]:
print(len(data))

46726


In [17]:
keys_to_keep = ['title', 'description', 'details', 'features', 'price']
data = [{k: datapoint[k] for k in datapoint if k in keys_to_keep} for datapoint in dataset if datapoint['price'] != 'None']

In [43]:
print(len(data))
print(data[0].keys())
for k, v in data[0].items():
    if k == 'details':
        try:
            v_dict = json.loads(v)
            print(f"{k}: ")
            for detail_key, detail_value in v_dict.items():
                print(f"\t{detail_key}: \n\t\t{detail_value}")
            print()
        except ValueError as e:
            print(e)
    else:
        print(f"{k}: \n\t{v}\n")

46726
dict_keys(['title', 'features', 'description', 'price', 'details'])
title: 
	Whirlpool W10918546 Igniter

features: 
	['This is a Genuine OEM Replacement Part.']

description: 
	['Whirlpool Igniter']

price: 
	25.07

details: 
	Manufacturer: 
		Whirlpool
	Part Number: 
		W10918546
	Item Weight: 
		1 pounds
	Product Dimensions: 
		1 x 1 x 1 inches
	Item model number: 
		W10918546
	Item Package Quantity: 
		1
	Batteries Included?: 
		No
	Batteries Required?: 
		No
	Warranty Description: 
		This is a Genuine OEM Replacement Part.
	Best Sellers Rank: 
		{'Tools & Home Improvement': 655322, 'Parts & Accessories': 93370}
	Date First Available: 
		April 19, 2019



In [19]:
characters = []
prices = []
for d in data:
    content = d['title'] + str(d['features']) + str(d['description']) + str(d['details'])
    characters.append(len(content))
    prices.append(float(d['price']))


In [None]:
avr = sum(characters)/len(characters)
plt.figure(figsize=(15,6))
plt.title((f"Number of characters: Avr {avr:,.0f} and highest {max(characters)}"
          f"    Approximate Avr tokens: {avr//4:,.0f}"))
plt.xlabel("Number of characters")
plt.ylabel("Count")
plt.hist(characters, rwidth=0.7, color="deepskyblue", bins=range(0, 6000, 100))
plt.show()

In [None]:
plt.figure(figsize=(15,6))
plt.title(f"Prices: Avr {sum(prices)/len(prices):,.2f} and highest {max(prices):,}")
plt.xlabel("Prices")
plt.ylabel("Count")
plt.hist(prices, rwidth=0.7, color="salmon", bins=range(0, 600, 10))
plt.show()

In [231]:
# # ESCAPE = """\	\ !"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^`\{\|\}\~ ¡£¤¥¦§¨©«¬­®¯°±´·¸»¿×˚́̊  ​‍‎‏‐‑‒–—―‘’‚“”„•…  ‰′″›※‼‿⁄€⃣℃℉™↑→↖↘↙∕√∞≈≤≥≦≪⌀⌛⌦⏩⏱⏳⏺ⒶⒷⒸⒹⒺⓇ■▣▪▲△▶▷▸►▼▽◀◄◆◇◈◉◎●◕◢◤◥◦◼◾☀☁☃★☆☉☎☑☔☕☛☝☞☪☰☺☻♀♂♐♕♚♛♠♡♣♥♦♨♪♫♬♻⚒⚗⚙⚠⚡⚪⚫⛅⛓⛔⛳✂✅✆✈✉✊✋✌✍✎✏✐✓✔✚✤✦✧✨✩✪✫✮✯✸✽✾✿❀❁❃❄❅❆❇❉❋❌❎❓❖❗❛❜❣❤❥❰❱➕➡➢➤➥➬➯➱➼➽⬛⬜⭐⭕　、。〃《》「」『』【】〔〕〖〗〚〛〜・㋡㎛㎡︎️﻿！＂＃＄％＆（）＋，－．／：；＜＞？［］｜～｡･�"""
# ESCAPE = """\	\ !"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^`\{\|\}\~ ¡£¤¥¦§¨©«¬­®¯°±´·¸»¿×˚́̊  ​‍‎‏‐‑‒–—―‘’‚“”„•…  ‰′″›※‼‿⁄€⃣℃℉™↑→↖↘↙∕√∞≈≤≥≦≪⌀⌛⌦⏩⏱⏳⏺ⒶⒷⒸⒹⒺⓇ■▣▪▲△▶▷▸►▼▽◀◄◆◇◈◉◎●◕◢◤◥◦◼◾☀☁☃★☆☉☎☑☔☕☛☝☞☪☰☺☻♀♂♐♕♚♛♠♡♣♥♦♨♪♫♬♻⚒⚗⚙⚠⚡⚪⚫⛅⛓⛔⛳✂✅✆✈✉✊✋✌✍✎✏✐✓✔✚✤✦✧✨✩✪✫✮✯✸✽✾✿❀❁❃❄❅❆❇❉❋❌❎❓❖❗❛❜❣❤❥❰❱➕➡➢➤➥➬➯➱➼➽⬛⬜⭐⭕　、。〃《》「」『』【】〔〕〖〗〚〛〜・㋡㎛㎡︎️﻿！＂＃＄％＆（）＋，－．／：；＜＞？［］｜～｡･�"""

# # ESCAPE = """[\\!"\\#\\$%\\&'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\]\\^`\\{\\|\\}\\~¡£¤¥¦§¨©«¬­®¯°±´·¸»¿×˚́̊​‍‎‏‐‑‒–—―‘’‚“”„•…‰′″›※‼‿⁄€⃣℃℉™↑→↖↘↙∕√∞≈≤≥≦≪⌀⌛⌦⏩⏱⏳⏺ⒶⒷⒸⒹⒺⓇ■▣▪▲△▶▷▸►▼▽◀◄◆◇◈◉◎●◕◢◤◥◦◼◾☀☁☃★☆☉☎☑☔☕☛☝☞☪☰☺☻♀♂♐♕♚♛♠♡♣♥♦♨♪♫♬♻⚒⚗⚙⚠⚡⚪⚫⛅⛓⛔⛳✂✅✆✈✉✊✋✌✍✎✏✐✓✔✚✤✦✧✨✩✪✫✮✯✸✽✾✿❀❁❃❄❅❆❇❉❋❌❎❓❖❗❛❜❣❤❥❰❱➕➡➢➤➥➬➯➱➼➽⬛⬜⭐⭕、。〃《》「」『』【】〔〕〖〗〚〛〜・㋡㎛㎡︎️！＂＃＄％＆（）＋，－．／：；＜＞？［］｜～｡･�]"""

# whitespace_pattern = re.compile(
#     r'[\s\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000\uFEFF]', flags=re.UNICODE
# )
# CLEANED_ESCAPE = whitespace_pattern.sub("", ESCAPE)
# # SAFE_ESCAPE = CLEANED_ESCAPE

# SAFE_ESCAPE = "[" + re.escape(CLEANED_ESCAPE) + ']'
# # # SAFE_ESCAPE = '[{}]'

# print(ESCAPE_PATTERN.pattern)







class Item:
    BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
    
    MIN_TOKENS = 150 # Any less than this, and we don't have enough useful content
    MAX_TOKENS = 160 # Truncate after this many tokens. Then after adding in prompt text, we will get to around 180 tokens
    
    MIN_CHARS = 300
    CEILING_CHARS = MAX_TOKENS * 5

    SAFE_ESCAPE = """[\\\\!"\\\#\\\$%\\\&'\\\(\\\)\\\*\\\+,\\\-\\\./:;<=>\\\?@\\\[\\\\\]\\\^`\\\{\\\|\\\}\\\~¡£¤¥¦§¨©«¬­®¯°±´·¸»¿×˚́̊​‍‎‏‐‑‒–—―‘’‚“”„•…‰′″›※‼‿⁄€⃣℃℉™↑→↖↘↙∕√∞≈≤≥≦≪⌀⌛⌦⏩⏱⏳⏺ⒶⒷⒸⒹⒺⓇ■▣▪▲△▶▷▸►▼▽◀◄◆◇◈◉◎●◕◢◤◥◦◼◾☀☁☃★☆☉☎☑☔☕☛☝☞☪☰☺☻♀♂♐♕♚♛♠♡♣♥♦♨♪♫♬♻⚒⚗⚙⚠⚡⚪⚫⛅⛓⛔⛳✂✅✆✈✉✊✋✌✍✎✏✐✓✔✚✤✦✧✨✩✪✫✮✯✸✽✾✿❀❁❃❄❅❆❇❉❋❌❎❓❖❗❛❜❣❤❥❰❱➕➡➢➤➥➬➯➱➼➽⬛⬜⭐⭕、。〃《》「」『』【】〔〕〖〗〚〛〜・㋡㎛㎡︎️！＂＃＄％＆（）＋，－．／：；＜＞？［］｜～｡･�]"""
    ESCAPE_PATTERN = re.compile(SAFE_ESCAPE, flags=re.UNICODE)

    tokenizer = AutoTokenizer.from_pretrained(Item.BASE_MODEL, trust_remote_code=True)
    PREFIX = "Price is $"
    QUESTION = "How much does this cost to the nearest dollar?"
    REMOVALS = ['"Batteries Included?": "No"', '"Batteries Included?": "Yes"', 
                '"Batteries Required?": "No"', '"Batteries Required?": "Yes"', 
                "By Manufacturer", "Item", "Date First", "Package", ":", "Number of",
                "Best Sellers", "Number", "Product "]
    
    
    def __init__(self, datapoint, price):
        self.title = datapoint['title']
        self.price = price
        self.parse(datapoint)
        self.category: str
        self.token_count: int = 0
        self.details: Optional[str]
        self.prompt: Optional[str] = None
        self.include: False
        
    def parse(self, stuff):
        descr = datapoint['description']
        features = datapoint['features']
        self.details = datapoint['details']
        
        content = ""
        if descr:
            content += "\n".join(descr) + "\n"
        if features:
            content += "\n".join(features) + "\n"
        if self.details:
            content += self.scrub_details() + "\n"
        
        self.content = self.scrub(content)
        if len(content) > Item.MIN_CHARS:
            content = content[:Item.CEILING_CHARS]
            text =f"{self.title}\n{self.content}"
            tokens = self.tokenizer.encode(text, add_special_tokens=False)
            if len(tokens) > Item.MIN_TOKENS:
                tokens = tokens[:Item.MAX_TOKENS]
                text = self.tokenizer.decode(tokens)
                self.make_prompt(text)
                self.include=True
                

    def scrub_details(self):
        details = self.details
        for remove in Item.REMOVALS:
            details = details.replace(remove, "")
        return details
    
    def scrub(self, stuff):
        stuff = Item.ESCAPE_PATTERN.sub('', stuff).strip()
        words = stuff.split(" ")
        filtered = [w for w in words if len(w) < 6 or not any(char.isdigit() for char in w)]
        return " ".join(filtered)

    def make_prompt(self, text):
        self.prompt = f"{Item.QUESTION}\n\n{text}\n\n"
        self.promt += f"{Item.PREFIX}{self.price:.2f}"
        self.token_count = len(self.tokenizer.encode(self.prompt, add_special_tokens=False))

    def test_prompt(self):
        return self.prompt.split(item.PREFIX)[0] + item.PREFIX
        
    def __repr__(self):
        return f"Title: {self.title}\nPrice: {self.price:.2f}\nContent: {self.content}"
        

In [232]:
item = Item(data[100], float(data[100]['price']))
print()
print(item)


Title: Broan BPQTAF Aluminum Replacement Filter for QT20000 Range Hoods
Price: 9.50
Content: Rack Rollers Compatible With GE Dishwasher Non-OEM replacement Replaces 2493 DW113 DW604 Color Rack Rollers Compatible With GE YEAR WARRANTY
Replaces 2493 DW113 DW604 White
Manufacturer Edgewater Parts Part   Weight 6 ounces  Dimensions 6 x 6 x 1 inches Is Discontinued  No    Rank Tools  Home Improvement Parts  Accessories  Available June 1 2016


In [212]:
escape = """\	\ !"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^`\{\|\}\~ ¡£¤¥¦§¨©«¬­®¯°±´·¸»¿×˚́̊  ​‍‎‏‐‑‒–—―‘’‚“”„•…  ‰′″›※‼‿⁄€⃣℃℉™↑→↖↘↙∕√∞≈≤≥≦≪⌀⌛⌦⏩⏱⏳⏺ⒶⒷⒸⒹⒺⓇ■▣▪▲△▶▷▸►▼▽◀◄◆◇◈◉◎●◕◢◤◥◦◼◾☀☁☃★☆☉☎☑☔☕☛☝☞☪☰☺☻♀♂♐♕♚♛♠♡♣♥♦♨♪♫♬♻⚒⚗⚙⚠⚡⚪⚫⛅⛓⛔⛳✂✅✆✈✉✊✋✌✍✎✏✐✓✔✚✤✦✧✨✩✪✫✮✯✸✽✾✿❀❁❃❄❅❆❇❉❋❌❎❓❖❗❛❜❣❤❥❰❱➕➡➢➤➥➬➯➱➼➽⬛⬜⭐⭕　、。〃《》「」『』【】〔〕〖〗〚〛〜・㋡㎛㎡︎️﻿！＂＃＄％＆（）＋，－．／：；＜＞？［］｜～｡･�"""
# escape = """[!"#%&'()+,./:;?\{|}’”▶【】（）]"""

safe_escape = "[" + re.escape(escape) + ']'
escape_pattern = re.compile(safe_escape, flags=re.UNICODE)

allowed_pattern = re.compile("[\w\U00010000-\U0010FFFF]", flags=re.UNICODE)
# allowed_pattern = re.compile("[\w ]", flags=re.UNICODE)


all_symbols = set()
escape_symbols = set()
allowed_symbols = set()

for datapoint in data:
    item = Item(datapoint, float(datapoint['price']))
    all_symbols |= set(item.content)
    escape_symbols |= set(allowed_pattern.sub("", item.content))
    allowed_symbols |= set(escape_pattern.sub("", item.content))

all_symbols = sorted(all_symbols)
escape_symbols = sorted(escape_symbols)
allowed_symbols = sorted(allowed_symbols)

In [None]:
print(f"All Symbols: [{''.join(all_symbols)}]\n")
print(f"Escape Symbols: [{''.join(escape_symbols)}]\n")
print(f"Allowed Symbols: [{''.join(allowed_symbols)}]\n")
