## Download the data

In [3]:
fname_products = "meta_Clothing_Shoes_and_Jewelry.json.gz"
fname_vectors = "image_features_Clothing_Shoes_and_Jewelry.b"

#!wget -nc http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/{fname_products}
!wget -c http://snap.stanford.edu/data/amazon/productGraph/image_features/categoryFiles/{fname_vectors}

!du -hs meta_Clothing_Shoes_and_Jewelry.json.gz
!du -hs image_features_Clothing_Shoes_and_Jewelry.b

--2021-02-11 22:22:47--  http://snap.stanford.edu/data/amazon/productGraph/image_features/categoryFiles/image_features_Clothing_Shoes_and_Jewelry.b
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 206 Partial Content
Length: 24495439374 (23G), 500269078 (477M) remaining [text/plain]
Saving to: ‘image_features_Clothing_Shoes_and_Jewelry.b’

image_features_Clot 100%[+++++++++++++++++++>]  22.81G   759KB/s    in 3m 33s  

2021-02-11 22:26:21 (2.23 MB/s) - ‘image_features_Clothing_Shoes_and_Jewelry.b’ saved [24495439374/24495439374]

272M	meta_Clothing_Shoes_and_Jewelry.json.gz
 23G	image_features_Clothing_Shoes_and_Jewelry.b


## Explore the data

Import required libraries

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from amazonutils import *
from itertools import islice
from tqdm import tqdm
from pprint import pprint, pformat
from IPython.display import Image, display, Markdown, Code, HTML
import matplotlib.pyplot as plt
import numpy as np
import json

Iterate over the metadata for a few products

In [None]:
# Bulky fields that are left out of the printed summary.
skipped_keys = {'related', 'description'}

# Show the products at positions 5, 6 and 7 (zero-based) of the metadata stream:
# pretty-print the remaining fields, then render the product image.
for product in islice(iter_products(fname_products), 5, 8):
  summary = dict(item for item in product.items() if item[0] not in skipped_keys)
  pprint(summary)
  display(Image(product['imUrl'], width=128, height=128))

Let's look at the first three values of each 4096-float image vector:

In [None]:
# Take only the first three (key, vector) pairs from the feature file and
# print each key, the vector length and its first three components.
leading_vectors = islice(iter_vectors(fname_vectors), 3)
for product_id, features in leading_vectors:
  print(product_id, len(features), features[:3])

Reduce vector dimensionality:

In [None]:
# Target length of the reduced image vectors.
vector_dims = 256

# NOTE(review): iter_vectors_reduced comes from amazonutils and is not visible
# here; it returns a callable that is invoked with the file name below.
# Presumably `samples` is the number of vectors used to fit the reduction - confirm.
reduced = iter_vectors_reduced(fname_vectors, dims=vector_dims, samples=10000)

# Same preview as for the raw vectors, now on the reduced ones.
leading_reduced = islice(reduced(fname_vectors), 3)
for product_id, short_vec in leading_reduced:
  print(product_id, len(short_vec), short_vec[:3])

# Stack the first 20000 reduced vectors into one array (keys are dropped).
first_pairs = islice(reduced(fname_vectors), 20000)
sample = np.array([pair[1] for pair in first_pairs])

# Histogram of every component value, with a log-scaled count axis.
fig, ax = plt.subplots()
ax.set_title("Shape: %s, mean: %.3f" % (sample.shape, sample.mean()))
ax.hist(sample.ravel(), bins=40, log=True)
plt.show()

## Create Vespa Application Package

Create an ApplicationPackage instance to hold all relevant info about our search application.

In [None]:
from vespa.package import ApplicationPackage, Field

# Container for the search application's configuration; the document fields
# are added to its schema in the next cell.
app_package = ApplicationPackage(name="product_search")

In [None]:

# Register the document fields of the product schema.
# - asin / price: stored as attributes and returned in summaries.
# - title / description: indexed text with BM25 enabled.
# - image_vector / reduced_image_vector: float tensors kept as attributes.
app_package.schema.add_fields(
    Field(name="asin", type="string", indexing=["attribute", "summary"]),
    Field(name="title", type="string", indexing=["index", "summary"], index="enable-bm25"),
    Field(name="description", type="string", indexing=["index", "summary"], index="enable-bm25"),
    Field(name="price", type="float", indexing=["attribute", "summary"]),
    Field(name="image_vector", type="tensor<float>(x[4096])", indexing=["attribute"]),
    # Sized from vector_dims (set in the dimensionality-reduction cell) so the
    # schema always matches the length of the reduced vectors.
    Field(name="reduced_image_vector", type=f"tensor<float>(x[{vector_dims}])", indexing=["attribute"]),
)

## Deploy your application

In [None]:
from vespa.package import VespaDocker

# NOTE(review): `disk_folder` is not assigned in the visible cells; it must be
# bound to the desired absolute path before this cell runs.
vespa_docker = VespaDocker(port=8089)

# Deploy the application package; the returned handle is kept as `app`.
app = vespa_docker.deploy(application_package=app_package, disk_folder=disk_folder)

## Feed data to your application