In [1]:
import sys

# This will print the exact path to the Python executable that is running this notebook's code.
print(sys.executable)

/Users/qyxmacmini/Documents/GitHub/e-commerce-chatbot/.venv/bin/python


In [2]:
import os
from dotenv import load_dotenv
from pathlib import Path

# --- Load Project-Specific Environment Variables ---
# This is the key step: It searches for a .env file and loads it.
# It's smart enough to search up from the current directory to find it.
load_dotenv()

# You can now verify that the environment variable is set for this session
hf_home = os.getenv("HF_HOME")
print(f"Hugging Face cache is set to: {hf_home}")

# ... rest of your notebook

Hugging Face cache is set to: /Volumes/ExtremeSSD/workingspace/ChatBotAmazon/data


In [None]:
import pandas as pd
from datasets import load_dataset

# --- Load Metadata (from your original code) ---
# This loads from the cache after the first run
dataset_meta = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Electronics", trust_remote_code=True)
meta_df = pd.DataFrame(dataset_meta['full'])


# --- Load Reviews (without streaming) ---
# This will download the dataset once and cache it for future use.
# Subsequent runs will be much faster.
print("Loading reviews dataset. This may take a while on the first run...")
dataset_reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Electronics", trust_remote_code=True)

# Take a sample of 1000 reviews to work with
reviews_sample = dataset_reviews['full'].shuffle(seed=42).select(range(1000))
reviews_df = pd.DataFrame(reviews_sample)

print("Metadata and a sample of reviews have been loaded into DataFrames.")
reviews_df.head()

# You can run this cell multiple times without any downloading.
# It uses the 'reviews_df' DataFrame we already loaded in memory.

print("Inspecting raw 'description' text from the pre-loaded DataFrame:")
for i, desc in enumerate(reviews_df['text'].dropna().sample(3, random_state=42)):
    print(f"--- Sample {i+1} ---")
    print(desc)
    print("\\n")

In [4]:
import pandas as pd
from datasets import load_dataset

# --- Load Metadata (from your original code) ---
# This loads from the cache after the first run
dataset_meta = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Electronics", trust_remote_code=True)
dataset_meta['full'][0]

{'main_category': 'All Electronics',
 'title': 'FS-1051 FATSHARK TELEPORTER V3 HEADSET',
 'average_rating': 3.5,
 'rating_number': 6,
 'features': [],
 'description': ['Teleporter V3 The “Teleporter V3” kit sets a new level of value in the FPV world with Fat Shark renowned performance and quality. The fun of FPV is experienced firsthand through the large screen FPV headset with integrated NexwaveRF receiver technology while simultaneously recording onboard HD footage with the included “PilotHD” camera. The “Teleporter V3” kit comes complete with everything you need to step into the cockpit of your FPV vehicle. We’ve included our powerful 250mW 5.8Ghz transmitter, 25 degree FOV headset (largest QVGA display available), the brand new “PilotHD” camera with live AV out and all the cables, antennas and connectors needed.'],
 'price': 'None',
 'images': {'hi_res': [None],
  'large': ['https://m.media-amazon.com/images/I/41qrX56lsYL._AC_.jpg'],
  'thumb': ['https://m.media-amazon.com/images/I

In [1]:
import pandas as pd
from datasets import load_dataset

# Load the user reviews for the "Electronics" category
# This dataset is large, so we'll again stream it and take a sample.
# The name for the electronics reviews is "raw_review_Electronics"
streaming_dataset_reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Electronics", trust_remote_code=True, split="full", streaming=True)

# Let's take a sample of 1000 reviews to start
sample_reviews = list(streaming_dataset_reviews.take(1000))

# Create a DataFrame
reviews_df = pd.DataFrame(sample_reviews)

# Display the first few rows of the reviews DataFrame
print("Shape of the reviews DataFrame:", reviews_df.shape)
reviews_df.head()

Shape of the reviews DataFrame: (1000, 10)


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,3.0,Smells like gasoline! Going back!,First & most offensive: they reek of gasoline ...,[{'small_image_url': 'https://m.media-amazon.c...,B083NRGZMM,B083NRGZMM,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1658185117948,0,True
1,1.0,Didn’t work at all lenses loose/broken.,These didn’t work. Idk if they were damaged in...,[],B07N69T6TM,B07N69T6TM,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1592678549731,0,True
2,5.0,Excellent!,I love these. They even come with a carry case...,[],B01G8JO5F2,B01G8JO5F2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1523093017534,0,True
3,5.0,Great laptop backpack!,I was searching for a sturdy backpack for scho...,[],B001OC5JKY,B001OC5JKY,AGGZ357AO26RQZVRLGU4D4N52DZQ,1290278495000,18,True
4,5.0,Best Headphones in the Fifties price range!,I've bought these headphones three times becau...,[],B013J7WUGC,B07CJYMRWM,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,1676601581238,0,True


In [4]:
# Display 20 random samples to get a better feel for the data
print("Displaying 20 random samples from the metadata:")
meta_df.sample(20)

import pandas as pd

# Calculate missing values and fill rate
missing_info = pd.DataFrame({
    'missing_count': meta_df.isnull().sum(),
    'fill_rate (%)': (1 - (meta_df.isnull().sum() / len(meta_df))) * 100
})

# Sort by fill rate to see the least complete columns first
missing_info.sort_values(by='fill_rate (%)', ascending=True)

Displaying 20 random samples from the metadata:


Unnamed: 0,missing_count,fill_rate (%)
bought_together,1610012,0.0
author,1609536,0.029565
subtitle,1609161,0.052857
main_category,106334,93.395453
store,9520,99.4087
title,0,100.0
average_rating,0,100.0
rating_number,0,100.0
features,0,100.0
description,0,100.0


In [10]:
# Inspect the raw text of the 'description' field for a few products
print("Raw 'description' text for a few sample products:")
for i, desc in enumerate(meta_df['description'].dropna().sample(3, random_state=42)):
    print(f"--- Sample {i+1} ---")
    print(desc)
    print("\\n")

Raw 'description' text for a few sample products:
--- Sample 1 ---
['NAD - C 372 - Integrated Amplifier\xa0Technical Details * 2 x 150W Minimum Continuous Power into 4/8 ohms * 220W, 340W, 460W IHF Dynamic Power into 8, 4 and 2 ohms, respectively * High Current Holmgren? Toroidal Power Transformer * A/B Speaker outputs w']
\n
--- Sample 2 ---
['Protect your iPod with a design that is truly out of this world! GEAR4 is proud to offer this exclusive range of officially licensed iPod Touch cases as a companion to the worldwide phenomenon Angry Birds Space. This case is made from strong high gloss plastic that clips easily onto your iPod Touch covering the back and all sides from accidental bumps and scratches. The case has cut outs allowing easy access to all ports on the iPod, as well as a camera cut out and access to volume controls.']
\n
--- Sample 3 ---
[]
\n


In [11]:
from bs4 import BeautifulSoup

def clean_html(text):
    """
    Removes HTML tags from a given text string.
    Returns an empty string if the input is not a string.
    """
    if not isinstance(text, str):
        return ""
    # The 'lxml' parser is fast, but 'html.parser' is built-in if you don't have lxml installed.
    soup = BeautifulSoup(text, "lxml")
    return soup.get_text(separator=" ", strip=True)

print("`clean_html` function is defined and ready to use.")

`clean_html` function is defined and ready to use.


In [12]:
# Get the same 3 samples to test our cleaning function
sample_descriptions = meta_df['description'].dropna().sample(3, random_state=42)

print("Testing the HTML cleaning script:\\n")
for i, desc in enumerate(sample_descriptions):
    print(f"--- Sample {i+1} ---")
    print(f"Original: {desc[:300]}...") # Print first 300 chars of original
    cleaned_desc = clean_html(desc)
    print(f"Cleaned:  {cleaned_desc[:300]}...") # Print first 300 chars of cleaned
    print("\\n")

Testing the HTML cleaning script:\n
--- Sample 1 ---
Original: ['NAD - C 372 - Integrated Amplifier\xa0Technical Details * 2 x 150W Minimum Continuous Power into 4/8 ohms * 220W, 340W, 460W IHF Dynamic Power into 8, 4 and 2 ohms, respectively * High Current Holmgren? Toroidal Power Transformer * A/B Speaker outputs w']...
Cleaned:  ...
\n
--- Sample 2 ---
Original: ['Protect your iPod with a design that is truly out of this world! GEAR4 is proud to offer this exclusive range of officially licensed iPod Touch cases as a companion to the worldwide phenomenon Angry Birds Space. This case is made from strong high gloss plastic that clips easily onto your iPod Touch covering the back and all sides from accidental bumps and scratches. The case has cut outs allowing easy access to all ports on the iPod, as well as a camera cut out and access to volume controls.']...
Cleaned:  ...
\n
--- Sample 3 ---
Original: []...
Cleaned:  ...
\n


In [9]:
from bs4 import BeautifulSoup

def robust_clean_html(data):
    """
    Handles list or string input and removes HTML tags.
    """
    text = ""
    if isinstance(data, list) and len(data) > 0:
        # If it's a list, take the first element
        text = data[0]
    elif isinstance(data, str):
        text = data

    if not isinstance(text, str):
        return ""

    soup = BeautifulSoup(text, "lxml")
    return soup.get_text(separator=" ", strip=True)

# --- Let's test it on the same samples ---
sample_descriptions = meta_df['description'].dropna().sample(3, random_state=42)

print("Testing the ROBUST HTML cleaning script:\\n")
for i, desc in enumerate(sample_descriptions):
    # Notice we pass the raw data (which is a list) to the function
    cleaned_desc = robust_clean_html(desc)
    print(f"--- Sample {i+1} ---")
    print(f"Original: {str(desc)[:100]}...")
    print(f"Cleaned:  {cleaned_desc[:100]}...")
    print("\\n")

Testing the ROBUST HTML cleaning script:\n
--- Sample 1 ---
Original: ['NAD - C 372 - Integrated Amplifier\xa0Technical Details * 2 x 150W Minimum Continuous Power into 4...
Cleaned:  NAD - C 372 - Integrated Amplifier Technical Details * 2 x 150W Minimum Continuous Power into 4/8 oh...
\n
--- Sample 2 ---
Original: ['Protect your iPod with a design that is truly out of this world! GEAR4 is proud to offer this excl...
Cleaned:  Protect your iPod with a design that is truly out of this world! GEAR4 is proud to offer this exclus...
\n
--- Sample 3 ---
Original: []...
Cleaned:  ...
\n


  soup = BeautifulSoup(text, "lxml")


In [8]:
import pandas as pd
import numpy as np

def is_actually_empty(value):
    """
    A truly robust function to check for empty values in the DataFrame.
    The order of these checks is critical.
    """
    # 1. First, check for list-like or dict-like objects.
    #    Their emptiness is determined by their length.
    if isinstance(value, (list, dict, np.ndarray)):
        return len(value) == 0

    # 2. Now it's safe to check for standard null values (for scalars).
    if pd.isnull(value):
        return True

    # 3. Finally, check for empty strings.
    if isinstance(value, str):
        return value.strip() == ''

    # If none of the above conditions are met, the value is not empty.
    return False

# Use the new, truly robust function
empty_counts = meta_df.map(is_actually_empty).sum()

# Create the accurate completeness report
realistic_completeness_report = pd.DataFrame({
    'empty_count': empty_counts,
    'realistic_fill_rate (%)': (1 - (empty_counts / len(meta_df))) * 100
})

# Display the report
realistic_completeness_report.sort_values(by='realistic_fill_rate (%)', ascending=False)

Unnamed: 0,empty_count,realistic_fill_rate (%)
average_rating,0,100.0
rating_number,0,100.0
price,0,100.0
images,0,100.0
videos,0,100.0
details,0,100.0
parent_asin,0,100.0
title,94,99.994162
store,9522,99.408576
main_category,106334,93.395453


In [10]:
# Show the most common values in the 'price' column
print("Most frequent values in the 'price' column:")
print(meta_df['price'].value_counts().head())

# Show a few examples of products where the price is 0
print("\\n--- Examples of products with a price of 0.0 ---")
print(meta_df[meta_df['price'] == 0.0][['title', 'price']].head())

Most frequent values in the 'price' column:
price
None     1083247
9.99       15517
19.99      11481
8.99       10287
7.99        9309
Name: count, dtype: int64
\n--- Examples of products with a price of 0.0 ---
Empty DataFrame
Columns: [title, price]
Index: []


In [14]:
import pandas as pd

print("Checking all columns for the literal string 'None'...")

string_none_counts = {}
for col in meta_df.columns:
    # We can only check columns that have a string-like data type
    if meta_df[col].dtype == 'object':
        try:
            count = meta_df[meta_df[col] == 'None'].shape[0]
            if count > 0:
                string_none_counts[col] = count
        except TypeError:
            # This can happen if a column has mixed types. We'll ignore errors.
            pass

if string_none_counts:
    print("\\nFound columns containing the string 'None':")
    for col, count in string_none_counts.items():
        print(f"- {col}: {count}")
else:
    print("\\nNo columns were found to contain the string 'None'.")

Checking all columns for the literal string 'None'...
\nFound columns containing the string 'None':
- price: 1083247
- store: 39


In [7]:
import pandas as pd
import numpy as np

def is_empty_definitive(value):
    """
    Definitive function that correctly handles all known empty cases,
    including the literal string 'None'.
    """
    if value == 'None':
        return True
    if isinstance(value, (list, dict, np.ndarray)):
        return len(value) == 0
    if pd.isnull(value):
        return True
    if isinstance(value, str):
        return value.strip() == ''
    return False

print("--- Comprehensive Column Analysis ---\\n")

# Define how to handle different columns
categorical_cols = ['store', 'main_category', 'average_rating']
numerical_cols = ['price', 'rating_number']
high_cardinality_cols = ['title', 'description']

for col in meta_df.columns:
    print(f"--- Analyzing column: '{col}' ---")

    # 1. First, calculate and print the definitive fill rate for every column
    empty_count = meta_df[col].map(is_empty_definitive).sum()
    fill_rate = (1 - (empty_count / len(meta_df))) * 100
    print(f"Definitive Fill Rate: {fill_rate:.2f}%")

    # 2. Then, perform the appropriate detailed analysis
    if col in categorical_cols:
        print("Top 5 most frequent values:")
        print(meta_df[col].value_counts().head(5))

    elif col in numerical_cols:
        print("Descriptive statistics:")
        numeric_series = pd.to_numeric(meta_df[col], errors='coerce')
        print(numeric_series.describe())

    elif col in high_cardinality_cols:
        try:
            # This will work for 'title' but fail for 'description' because it contains lists
            uniqueness_ratio = meta_df[col].nunique() / len(meta_df)
            print(f"Uniqueness ratio: {uniqueness_ratio:.4f}")
        except TypeError:
            # FIX: If nunique fails, it's because the column has unhashable lists.
            # We can convert to strings to get an approximate uniqueness.
            print("Column contains unhashable lists. Calculating uniqueness on string-converted values.")
            uniqueness_ratio = meta_df[col].astype(str).nunique() / len(meta_df)
            print(f"Uniqueness ratio (as string): {uniqueness_ratio:.4f}")

    else:
        print(f"Skipping detailed analysis for column '{col}'.")

    print("\\n" + "="*50 + "\\n")

--- Comprehensive Column Analysis ---\n
--- Analyzing column: 'main_category' ---
Definitive Fill Rate: 93.40%
Top 5 most frequent values:
main_category
Computers                    418868
All Electronics              376435
Camera & Photo               223690
Cell Phones & Accessories    138237
Home Audio & Theater         106516
Name: count, dtype: int64
--- Analyzing column: 'title' ---
Definitive Fill Rate: 99.99%
Uniqueness ratio: 0.9636
--- Analyzing column: 'average_rating' ---
Definitive Fill Rate: 100.00%
Top 5 most frequent values:
average_rating
5.0    223376
4.0    115500
4.5    111736
4.4     94421
4.3     91925
Name: count, dtype: int64
--- Analyzing column: 'rating_number' ---
Definitive Fill Rate: 100.00%
Descriptive statistics:
count    1.610012e+06
mean     1.804833e+02
std      2.543980e+03
min      1.000000e+00
25%      3.000000e+00
50%      1.200000e+01
75%      4.900000e+01
max      1.034896e+06
Name: rating_number, dtype: float64
--- Analyzing column: 'features' 

In [10]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# --- We copy the exact same helper functions from the script ---

def robust_clean_html(data):
    """
    Handles list or string input, cleans HTML tags, and returns clean text.
    """
    text_to_clean = ""
    if isinstance(data, list) and len(data) > 0:
        text_to_clean = data[0]
    elif isinstance(data, str):
        text_to_clean = data

    if not isinstance(text_to_clean, str):
        return "" # Return empty string if input is not text

    soup = BeautifulSoup(text_to_clean, "lxml")
    return soup.get_text(separator=" ", strip=True)

def is_empty_definitive(value):
    """
    Definitive function that correctly handles all known empty cases.
    """
    if value == 'None':
        return True
    if isinstance(value, (list, dict, np.ndarray)):
        return len(value) == 0
    if pd.isnull(value):
        return True
    if isinstance(value, str):
        return value.strip() == ''
    return False

def create_knowledge_document(product: dict) -> str:
    """
    Creates a single text document for a product, handling missing fields.
    """
    doc = f"Product Title: {product.get('title', 'N/A')}\\n"
    if not is_empty_definitive(product.get('store')):
        doc += f"Brand: {product.get('store')}\\n"
    if not is_empty_definitive(product.get('price')):
        try:
            price = float(product.get('price'))
            doc += f"Price: ${price:.2f}\\n"
        except (ValueError, TypeError):
            pass
    if not is_empty_definitive(product.get('features')):
        cleaned_features = [robust_clean_html(f) for f in product.get('features')]
        if cleaned_features:
            doc += "Features:\\n"
            for feature in cleaned_features:
                doc += f"- {feature}\\n"
    if not is_empty_definitive(product.get('description')):
        cleaned_description = robust_clean_html(product.get('description'))
        if cleaned_description:
            doc += f"Description: {cleaned_description}\\n"
    return doc

# --- Main Preview Logic ---

# Take a random sample of 10 products from our already loaded DataFrame
sample_df = meta_df.sample(10, random_state=42)

print("--- Previewing 10 Example Rows for knowledge_base.csv ---\\n")

# Process and print the sample
for index, row in sample_df.iterrows():
    product_dict = row.to_dict()

    parent_asin = product_dict.get('parent_asin')
    knowledge_doc = create_knowledge_document(product_dict)

    print(f"parent_asin: {parent_asin}")
    print("-------------------- knowledge_doc --------------------")
    print(knowledge_doc)
    print("="*60 + "\\n")

--- Previewing 10 Example Rows for knowledge_base.csv ---\n
parent_asin: B000V4DOY4
-------------------- knowledge_doc --------------------
Product Title: iPhone 8 Lightning Adapter Headphone Metal Case Headphone Audio Adapter 2.4A Quick Charge Earphones Splitter - compatible with IOS 11 ZERKAR (Black-lightning^)\nBrand: zerkar\nFeatures:\n- Toroidal Power Transformer\n- A/B Speaker outputs w\nDescription: NAD - C 372 - Integrated Amplifier Technical Details * 2 x 150W Minimum Continuous Power into 4/8 ohms * 220W, 340W, 460W IHF Dynamic Power into 8, 4 and 2 ohms, respectively * High Current Holmgren? Toroidal Power Transformer * A/B Speaker outputs w\n
parent_asin: B007WPHXA6
-------------------- knowledge_doc --------------------
Product Title: Gear4 Angry Birds SpaceTouch Case for iPod 4G - Fire Bomb Bird\nBrand: Gear4\nFeatures:\n- Camera hole\n- Clip-On design\n- Full access to all ports\nDescription: Protect your iPod with a design that is truly out of this world! GEAR4 is proud

  soup = BeautifulSoup(text_to_clean, "lxml")


In [2]:
import pandas as pd

# Define the path to the output file
# This path is relative to the project's root directory
csv_path = "../data/knowledge_base.csv"

# Read the CSV file into a pandas DataFrame
try:
    knowledge_df = pd.read_csv(csv_path)

    # Print the first 5 rows
    print("--- First 5 rows of knowledge_base.csv ---")
    print(knowledge_df.head())

except FileNotFoundError:
    print(f"Error: The file was not found at '{csv_path}'.")
    print("Please make sure you have run the 'scripts/1_build_knowledge_base.py' script first.")

--- First 5 rows of knowledge_base.csv ---
  parent_asin                                      knowledge_doc
0  B00MCW7G9M  Product Title: FS-1051 FATSHARK TELEPORTER V3 ...
1  B00YT6XQSE  Product Title: Ce-H22B12-S1 4Kx2K Hdmi 4Port\n...
2  B07SM135LS  Product Title: Digi-Tatoo Decal Skin Compatibl...
3  B089CNGZCW  Product Title: NotoCity Compatible with Vivoac...
4  B004E2Z88O  Product Title: Motorola Droid X Essentials Com...


In [3]:
knowledge_df

Unnamed: 0,parent_asin,knowledge_doc
0,B00MCW7G9M,Product Title: FS-1051 FATSHARK TELEPORTER V3 ...
1,B00YT6XQSE,Product Title: Ce-H22B12-S1 4Kx2K Hdmi 4Port\n...
2,B07SM135LS,Product Title: Digi-Tatoo Decal Skin Compatibl...
3,B089CNGZCW,Product Title: NotoCity Compatible with Vivoac...
4,B004E2Z88O,Product Title: Motorola Droid X Essentials Com...
...,...,...
1610007,B003NUIU9M,Product Title: Wintec FileMate Pro USB Flash D...
1610008,B0BHVY33TL,Product Title: Tsugar Noise Reduction Wireless...
1610009,B09SQGRFFH,Product Title: Hardshell Case for MacBook Pro ...
1610010,B091JWCSG5,"Product Title: FYY 12-13.3"" Laptop Sleeve Case..."
