In [4]:
#import pandas as pd
import json
import numpy as np
from tqdm.notebook import tqdm
import uuid
import helper
import weaviate 
from weaviate.batch import Batch 

# initiate the Weaviate client
client = weaviate.Client("http://localhost:8080")
client.timeout_config = (3, 200)


# load all review data 
reviewData = []
with open('data/sample_Home_and_Kitchen_5.json', 'r') as f:
    for l in tqdm(f):
        reviewData.append(json.loads(l))

In [6]:
# load all metadata
count = 0
data = []
with open('data/sample_meta_Home_and_Kitchen.json', 'r') as f:
    for l in tqdm(f):
        data.append(json.loads(l))
        count = count + 1
        if count >= 3000:
          break


productData = []

for i in range(0, 3000):
    hasprice="price" in data[i]
    hastitle="title" in data[i]
    hasdescription="description" in data[i]
    hasmaincat='main_cat' in data[i]
   # print(i,  (hasprice and hastitle and hasdescription))
    if (hasprice and hastitle and hasdescription and hasmaincat)==True:
        if len(data[i]['title'])<200:
            productData.append(data[i])
    else:
        continue


for i in range(0, len(productData)): 
   productData[i]['description']=" ".join(productData[i]['description'])
   cleaned_text = productData[i]['description'].replace("\\", "")
   productData[i]['description']=cleaned_text
   productData[i]['price']=productData[i]['price'].replace("$","").replace(",","")
   productData[i]['price']=float(productData[i]['price'])
   if i == 0:
       print(productData[i]) 



0it [00:00, ?it/s]

{'category': ['Home & Kitchen', 'Vacuums & Floor Care'], 'description': 'Eureka Replacement Vacuum Belt', 'title': 'Eureka 54312-12 Vacuum Cleaner Belt', 'brand': 'Eureka', 'feature': ['Limit 1 per order', 'Returns will not be honored on this closeout item'], 'rank': '>#1,098,930 in Home & Kitchen (See Top 100 in Home & Kitchen)>#17,327 in Home & Kitchen > Vacuums & Floor Care', 'also_view': ['B004B54FM4', 'B014N37IBI', 'B00VH79FH4', 'B008MKNG6U', 'B001AO1VBW', 'B00TM8XQK2', 'B001EZIEOO', 'B013KYDLJY', 'B013JKGOH0', 'B0195UJPGU', 'B001ANZQSM', 'B00BY3VYFC', 'B00007E7OH'], 'main_cat': 'Amazon Home', 'price': 4.36, 'asin': 'B00002N62Y'}


In [10]:
## here needs a step to clean and process / filter out data 
## len(productData['title']) <100 
print(client.schema)

<weaviate.schema.crud_schema.Schema object at 0x7fd00c042f10>


In [8]:
client.schema.delete_all() # delete all classes

In [9]:
import weaviate

# skipped product "brand", it can be it's own class, for simplicity, skip it for now 
# skipped product "category" list of strings (text?) not sure how to type it yet, or if category should be it's own class?
# skipped "similar_item"
schema = {
    "classes": [
        {
            # name of the class
            "class": "Product",
            # class properties
            "properties": [
                {
                    "name": "asin",
                    "dataType": ["string"]
                },
                {
                    "name": "title",
                    "dataType": ["text"]
                },
                {
                    "dataType": ["number"],
                    "description": "The price product in dollars",
                    "name": "price"
                },
                {
                    "dataType": ["text"],
                    "name": "productDescription",
                    "description": "description of product"

                },
                {
                    "dataType": ["string"],
                    "name": "mainCat",
                    "description": "main category of the product in amazon "
                }
            ]
        }
    ]
}
client.schema.create(schema)

In [11]:
def prettify(json_dict): 
    print(json.dumps(json_dict, indent=2))

prettify(client.schema.get())

{
  "classes": [
    {
      "class": "Product",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "moduleConfig": {
        "text2vec-contextionary": {
          "vectorizeClassName": true
        }
      },
      "properties": [
        {
          "dataType": [
            "string"
          ],
          "moduleConfig": {
            "text2vec-contextionary": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "asin",
          "tokenization": "word"
        },
        {
          "dataType": [
            "text"
          ],
          "moduleConfig": {
            "text2vec-contextionary": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
        

In [12]:
# test that the helper uuid generating function works 
helper.generate_uuid('product', productData[350]['title']+productData[350]['asin'])

'28c5e357-2d19-55d0-a685-1756ad0a7e47'

In [13]:
from weaviate.batch import Batch # for the typing purposes

# so far i only got to successfully import 'asin', 'title'

def add_product(batch: Batch, product_data: dict) -> str:
    product_object = {
        'asin': product_data['asin'],
        'title': product_data['title'],
        'main_cat': product_data['main_cat'],
        'description': product_data['description'],             
        #'feature': product_data['feature'] if 'feature' in product_data else '',
        'price': product_data['price']
    }
    # generate an UUID for the Author
    product_id = helper.generate_uuid('product', product_data['title']+product_data['asin'])
   
    # add article to the batch
    batch.add_data_object(  
        data_object=product_object,
        class_name='Product',
        uuid=product_id
    ) 
    return product_id





In [14]:
from tqdm import trange

# just to test a small number to see if it works 

for i in trange(500, 1000):
    product_id = add_product(client.batch, productData[i])    
    if i % 20 == 0:
        # submit the objects from the batch to weaviate
        client.batch.create_objects()
status_objects = client.batch.create_objects()
client.batch.flush()

100%|██████████| 500/500 [00:01<00:00, 409.42it/s]


In [21]:
result = client.query.get(class_name='Product', properties="title").do()
print(f"Number of reviews returned: {len(result['data']['Get']['Product'])}")
result


Number of reviews returned: 100


{'data': {'Get': {'Product': [{'title': '20 Fleece Vacuum Cleaner Bags, Filter Bags for Miele S 2121'},
    {'title': 'Genuine Sebo Vacuum Cleaner 36.5Mm Large Dusting Brush 1094Er'},
    {'title': '1 X Genuine DYSON DC25 DC26 DC29 DC30 DC32 Vacuum Cleaner Spring Wand Handle Catch'},
    {'title': 'Karcher Karcher Wet & Dry Vacuum Cleaners Cartridge Filter KAR/64145520'},
    {'title': 'Complete Vacuum Cleaner Hose Assembly Designed to Fit Dyson DC17 Animal, DC17 Asthma & Allergy, DC17 Total Clean'},
    {'title': 'Miele S8590 Alize Canister Vacuum Cleaner (Old Model)'},
    {'title': 'DerBlue Compatible/Replacement Parts 10 x Aero Vac Filters Kit for iRobot Roomba 500 600 Series 536 550 614 620 630 650 655 660 665 671 680 690 Vacuum Cleaner Accessory'},
    {'title': 'Genuine Kirby Vacuum Cleaner Brush Roll #313292S Fits Floor Care Kit, Polish Hardwood, Tile'},
    {'title': 'Oreck Model XL3600, 2000 Series Vacuum Cleaner Generic Threaded Motor Fan Housing'},
    {'title': 'Vacuum Cle