#### Create a new conda environment before running this notebook

In [4]:
# !wget https://raw.githubusercontent.com/jina-ai/workshops/main/memes/requirements.txt -qN
# !pip install -r requirements.txt
# !pip install pandas 
# !pip install sentence-transformers

In [5]:
from jina import Document, DocumentArray
import pandas as pd 
import random

In [23]:
def _clean_field(value):
    """Return ``value`` as a single-line string without surrounding whitespace."""
    # str() lets non-string cells (e.g. a numeric category) pass through
    # instead of failing on a missing ``.replace`` method.
    return str(value).replace('\n', ' ').strip()


def prep_docs(input_file : str, num_size = -1, shuffle=True, seed=None):
    """Build a DocumentArray of products from a CSV file.

    The first three CSV columns are read positionally as product name,
    category and description. Rows containing any missing value and exact
    duplicate rows are dropped. Each remaining row whose cleaned name is
    non-empty becomes one Document: the name is the Document text, and the
    category / description are stored in ``tags['category']`` and
    ``tags['description']``.

    Args:
        input_file: Path (or URL) of the CSV file, passed to ``pd.read_csv``.
        num_size: Maximum number of Documents to return. ``-1`` (the default),
            any other negative value, or ``None`` means "no limit".
        shuffle: If True, rows are put in random order before Documents are
            built, so a limited ``num_size`` yields a random sample.
        seed: Optional ``random_state`` for the shuffle; pass an int to make
            the sample reproducible. ``None`` keeps the unseeded behaviour.

    Returns:
        DocumentArray holding at most ``num_size`` Documents.
    """
    print(f"Processing {input_file}")

    df = pd.read_csv(input_file).dropna().drop_duplicates()

    if shuffle:
        df = df.sample(frac=1, random_state=seed)

    limit = len(df) if num_size is None or num_size < 0 else num_size

    docs = DocumentArray()
    for pname, pcate, pdesc in df.iloc[:, :3].itertuples(index=False, name=None):
        # Stop as soon as enough Documents exist instead of converting the
        # whole file and slicing afterwards.
        if len(docs) >= limit:
            break

        doctext = _clean_field(pname)
        if not doctext:
            # A product without a name has nothing to encode; skip it.
            continue

        doc = Document(text=doctext)
        doc.tags['category'] = _clean_field(pcate)
        doc.tags['description'] = _clean_field(pdesc)
        docs.append(doc)

    return docs

#### Download the `all_products.csv` file from GitHub

In [8]:
# NOTE: never commit a real access token; substitute your own token below when running.
# !wget "https://raw.githubusercontent.com/tactlabs/amazon-product-collector/main/amazon_crawler/all_products.csv?token=<YOUR_GITHUB_TOKEN>" -qN

#### Trying with only 1000 products

In [25]:
# Build a shuffled sample of at most 1000 product Documents from the CSV.
docs = prep_docs(
    "all_products.csv",
    num_size=1000,
    shuffle=True,
)
docs

Processing all_products.csv


<jina.types.arrays.document.DocumentArray length=1000 at 140188630323408>

In [27]:
# Peek at the first prepared Document: its text and its category tag.
first_doc = docs[0]
(first_doc.text, first_doc.tags['category'])

("Carter's Baby Boys 2-Pack Pull-On Pants (Black/Heather, 12 Months)",
 'clothing')

In [30]:
# Sanity check: every Document must carry non-empty text, otherwise there is
# nothing for the encoder to embed. (Printing the text of an empty Document
# would only emit blank lines, so fail loudly with a count instead.)
empty_text_docs = [doc for doc in docs if len(doc.text) == 0]
assert not empty_text_docs, (
    f"{len(empty_text_docs)} of {len(docs)} documents have empty text"
)

In [31]:
# Model identifier handed to the encoder as `pretrained_model_name_or_path`
# when the Flow is built in a later cell.
model = "sentence-transformers/paraphrase-distilroberta-base-v1" # Any model from Huggingface

In [34]:
from jina import Flow

In [35]:
# Two-step pipeline: the first executor encodes the text with the model chosen
# above, the second one stores the Documents in an index.
flow = Flow().add(
    name="error_text_encoder",
    uses="jinahub://TransformerTorchEncoder",
    uses_with={"pretrained_model_name_or_path": model},
).add(
    name="error_text_indexer",
    uses='jinahub://SimpleIndexer',
)


In [36]:
# Show the Flow object as this cell's output.
flow

In [37]:
# docs = docs[:10]

In [38]:
import shutil

# Remove workspace in case we've indexed stuff before.
# shutil works on every platform (no POSIX `rm` needed); ignore_errors=True
# keeps this a no-op when the directory does not exist yet.
shutil.rmtree("workspace", ignore_errors=True)

In [39]:
# Start the Flow and send every prepared Document through it for indexing.
with flow as index_flow:
    index_flow.index(inputs=docs)

Output()

Output()

Output()

Output()




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798293.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456356.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355881.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=239.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1352.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=328515953.0, style=ProgressStyle(descri…


[32merror_text_encoder@5462[L]:ready and listening[0m
[32merror_text_indexer@5462[L]:ready and listening[0m
[32m        gateway@5462[L]:ready and listening[0m
           Flow@5462[I]:🎉 Flow is ready to use!
	🔗 Protocol: 		[1mGRPC[0m
	🏠 Local access:	[4m[36m0.0.0.0:46885[0m
	🔒 Private network:	[4m[36m192.168.1.42:46885[0m[0m


In [40]:
# dir(flow)
# NOTE(review): in Jina 2.x `Flow.inspect()` looks like a builder call that
# attaches an inspection Pod and returns a Flow, rather than a call that
# reports information about the Flow — confirm this is what was intended here.
flow.inspect()

In [41]:
# Show the text of the first Document (handy as a reference query).
docs[0].text

"Carter's Baby Boys 2-Pack Pull-On Pants (Black/Heather, 12 Months)"

In [66]:
# Free-text query to run against the index.
query_doc = Document(text="boys")
# Alternative: query with the exact text of an indexed Document.
# query_doc = Document(text=docs[0].text)

In [67]:
# Start the Flow again and run the query, keeping what search() returns.
with flow as search_flow:
    response = search_flow.search(
        inputs=query_doc,
        return_results=True,
    )

[32merror_text_encoder@5462[L]:ready and listening[0m
[32merror_text_indexer@5462[L]:ready and listening[0m
[32m        gateway@5462[L]:ready and listening[0m
           Flow@5462[I]:🎉 Flow is ready to use!
	🔗 Protocol: 		[1mGRPC[0m
	🏠 Local access:	[4m[36m0.0.0.0:44337[0m
	🔒 Private network:	[4m[36m192.168.1.42:44337[0m[0m


In [68]:
# Matches attached to the first document of the first response.
first_response = response[0]
matches = first_response.docs[0].matches

In [69]:
# Show the match collection as this cell's output.
matches

<jina.types.arrays.match.MatchArray length=20 at 140188581399232>

In [70]:
# Print every match as a banner line (position and category) followed by the
# product text and a blank separator line.
for rank, match in enumerate(matches):
    banner = f' product number : {rank} | product category : {match.tags["category"]} '
    print(banner.center(60, '='))
    print(match.text, end='\n\n')

===== product number : 0 | product category : clothing =====
Hanes Boys' P480

===== product number : 1 | product category : footwear =====
Under Armour Boys UA Highlight RM Jr. Football Cleats

===== product number : 2 | product category : clothing =====
adidas Boys Separates Training Track Jacket (Big Kids)

===== product number : 3 | product category : footwear =====
Nike Air Jordan Big Boys (8-20) Short-Sleeve Jumpman T-Shirt Jordan

===== product number : 4 | product category : clothing =====
The Children's Place Boys' Basic Jogger

===== product number : 5 | product category : clothing =====
Under Armour Boys Lenticular Shadow Logo Short Sleeve Tee

===== product number : 6 | product category : clothing =====
Southpole Boys' Big Jogger Pants in Basic Stretch Twill Fabric

===== product number : 7 | product category : clothing =====
Champion Boys Long Sleeve Classic Hooded Tee Shirt Kids Clothes

===== product number : 8 | product category : clothing =====
Under Armour Boys' Armou