# 0 Background
- Notebook showcases semantic search for Wayfair's WAND product data set
- Leverages
    1. *SetenceTransformer* to get embedding
    2. *Langchain* integration to vector ChromaDB
    3. *PySpark* data processing
    4. *MLFlow* experimentation and model deployment workflow
    
- [source github](https://github.com/thomaschangsf/db-product-search)

# 1 Setup
- Installed pyspark dependencies: [spark ref](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)

- start notebook via terminal cmd: pyspark 

In [3]:
# Create SparkSession from builder
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]") \
                    .appName('Scalling_ml_with_spark') \
                    .getOrCreate()

from pyspark.sql.types import *
import pyspark.sql.functions as fn

import os

spark

23/05/05 09:11:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
spark.sql("SHOW TABLES;")

23/05/05 09:11:09 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


DataFrame[namespace: string, tableName: string, isTemporary: boolean]

# 2 Intro and Config.py
- Based on 00_Introl_and_Config.py

In [31]:
%cd ..

/Users/thomaschang/git/thomaschangsf/db-product-search


In [32]:
!echo {os.getcwd()}

/Users/thomaschang/git/thomaschangsf/db-product-search


In [4]:
os.getcwd()

'/Users/thomaschang/git/thomaschangsf/db-product-search'

In [5]:

if 'config' not in locals().keys():
  config = {}

config['database'] = 'wands'

# create database if not exists
_ = spark.sql('create database if not exists {0}'.format(config['database']))

# set current datebase context
_ = spark.catalog.setCurrentDatabase(config['database'])


# use this if we started notebook via venv/bin/pyspark
WORKDIR=f"{os.getcwd()}/work-dir"
print(f"WORKDIR={WORKDIR}")


# below is only true if we run from docker
# os.getenv('HOME') is /home/jovyan bc jupyter started with this command
# docker run -it --memory="28g" --memory-swap="30g"  -p 8888:8888 --mount type=bind,source=$(pwd),target=/home/jovyan adipolak/ml-with-apache-spark
# WORKDIR=f"{os.getenv('HOME')}/db-product-search/work-dir"


# DB
config['dbfs_path'] = f'{WORKDIR}/wands'
config['WANDS_DOWNLOADS_PATH'] = config['dbfs_path'] + '/downloads'
# SPARK-WAREHOUSE
SPARK_WAREHOUSE_DIR=f"{os.getcwd()}/spark-warehouse/wands.db"
config['WANDS_WAREHOUSE_PATH'] = SPARK_WAREHOUSE_DIR


# Models
config['WANDS_MODEL_PATH'] = WORKDIR + '/models'
config['basic_model_name'] = 'wands_basic_search'
config['tuned_model_name'] = 'wands_tuned_search'

# MFLOW
import mlflow
config['mlflow_path'] = f"{WORKDIR}/mlflow/experiments/"
mlflow.set_experiment(config['mlflow_path'])


config



WORKDIR=/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir


{'database': 'wands',
 'dbfs_path': '/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/wands',
 'WANDS_DOWNLOADS_PATH': '/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/wands/downloads',
 'WANDS_WAREHOUSE_PATH': '/Users/thomaschang/git/thomaschangsf/db-product-search/spark-warehouse/wands.db',
 'WANDS_MODEL_PATH': '/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/models',
 'basic_model_name': 'wands_basic_search',
 'tuned_model_name': 'wands_tuned_search',
 'mlflow_path': '/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/mlflow/experiments/'}

# 3 Data Prep
- Based on 01_data_Prep.py

### 3.1 Download raw data 
- to config[WANDS_DOWNLOADS_PATH]

In [8]:
!rm -rf {config['WANDS_DOWNLOADS_PATH']}
!mkdir -p {config['WANDS_DOWNLOADS_PATH']}
!pushd {config['WANDS_DOWNLOADS_PATH']}

config

~/git/thomaschangsf/db-product-search/work-dir/wands/downloads ~/git/thomaschangsf/db-product-search


{'database': 'wands',
 'dbfs_path': '/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/wands',
 'WANDS_DOWNLOADS_PATH': '/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/wands/downloads',
 'WANDS_MODEL_PATH': '/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/models',
 'basic_model_name': 'wands_basic_search',
 'tuned_model_name': 'wands_tuned_search',
 'mlflow_path': '/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/mlflow/experiments/'}

In [11]:
%cd {config['WANDS_DOWNLOADS_PATH']}

!echo "Download label.csv"
!wget -q https://raw.githubusercontent.com/wayfair/WANDS/main/dataset/label.csv

!echo "Download product"
!wget -q https://raw.githubusercontent.com/wayfair/WANDS/main/dataset/product.csv

!echo "Download query"
!wget -q https://raw.githubusercontent.com/wayfair/WANDS/main/dataset/query.csv

%popd

/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/wands/downloads
Download label.csv
Download product
Download query


### 3.2 Read into spark

In [12]:
from pyspark.sql.types import *
import pyspark.sql.functions as fn

import os

#### Clean all previous tables

In [34]:
!ls {config['WANDS_WAREHOUSE_PATH']}

[34mlabels[m[m   [34mproducts[m[m [34mqueries[m[m


In [22]:
!rm -rf {os.getcwd()}/spark-warehouse/wands.db/*

#### Process Product
- saves to spark-warehouse/wands.db/products/*.parquet

In [9]:
products_schema = StructType([
  StructField('product_id', IntegerType()),
  StructField('product_name', StringType()),
  StructField('product_class', StringType()),
  StructField('category_hierarchy', StringType()),
  StructField('product_description', StringType()),
  StructField('product_features', StringType()),
  StructField('rating_count', FloatType()),
  StructField('average_rating', FloatType()),
  StructField('review_count', FloatType())
  ])

_ = (
  spark
    .read
      .csv(
        path=f"{config['WANDS_DOWNLOADS_PATH']}/product.csv",
        sep='\t',
        header=True,
        schema=products_schema
        )
    .write
      .format('parquet')
      #.format('delta')
      .mode('overwrite')
      .option('overwriteSchema','true')
      .saveAsTable('products')
  )

display(spark.table('products'))


[Stage 0:>                                                        (0 + 16) / 16]

23/05/05 09:12:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/05/05 09:12:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
23/05/05 09:12:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
23/05/05 09:12:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
23/05/05 09:12:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
23/05/05 09:12:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58.46% for 13 writers
23/05/05 09:12:12 WARN MemoryManager: Total allocation exceeds 95.

                                                                                

23/05/05 09:12:14 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
23/05/05 09:12:14 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
23/05/05 09:12:14 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/05/05 09:12:14 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


DataFrame[product_id: int, product_name: string, product_class: string, category_hierarchy: string, product_description: string, product_features: string, rating_count: float, average_rating: float, review_count: float]

#### Process Query

In [10]:
queries_schema = StructType([
  StructField('query_id', IntegerType()),
  StructField('query', StringType()),
  StructField('query_class', StringType())
  ])

_ = (
  spark
    .read
    .csv(
      path=f"{config['WANDS_DOWNLOADS_PATH']}/query.csv",
      sep='\t',
      header=True,
      schema=queries_schema
      )
    .write
      .format('parquet')
      .mode('overwrite')
      .option('overwriteSchema','true')
      .saveAsTable('queries')
  )

display(
  spark.table('queries')
  )

DataFrame[query_id: int, query: string, query_class: string]

#### Process Labels

In [11]:
labels_schema = StructType([
  StructField('id', IntegerType()),
  StructField('query_id', IntegerType()),
  StructField('product_id', IntegerType()),
  StructField('label', StringType())
  ])

_ = (
  spark
    .read
    .csv(
      path=f"{config['WANDS_DOWNLOADS_PATH']}/label.csv",
      sep='\t',
      header=True,
      schema=labels_schema
      )
    .write
      .format('parquet')
      .mode('overwrite')
      .option('overwriteSchema','true')
      .saveAsTable('labels')
  )

display(spark.table('labels'))

DataFrame[id: int, query_id: int, product_id: int, label: string]

In [12]:
spark.table('labels').show(2)


+---+--------+----------+----------+
| id|query_id|product_id|     label|
+---+--------+----------+----------+
|  0|       0|     25434|     Exact|
|  1|       0|     12088|Irrelevant|
+---+--------+----------+----------+
only showing top 2 rows



In [13]:
if 'label_score' not in spark.table('labels').columns:
  _ = spark.sql('ALTER TABLE labels ADD COLUMN label_score FLOAT')

df_label = spark.table('labels')
df_label.show(2)

+---+--------+----------+----------+-----------+
| id|query_id|product_id|     label|label_score|
+---+--------+----------+----------+-----------+
|  0|       0|     25434|     Exact|       null|
|  1|       0|     12088|Irrelevant|       null|
+---+--------+----------+----------+-----------+
only showing top 2 rows



In [14]:
from pyspark.sql.functions import when,col


### Sql update does not work because it needs delta lake
#spark.sql("""
#UPDATE labels
#SET label_score = 
#  CASE lower(label)
#        WHEN 'Exact' THEN 1.0
#        WHEN 'Partial' THEN 0.75
#        WHEN 'Irrelevant' THEN 0.0
#        ELSE NULL
#        END;
#""")

df_label = df_label.withColumn("label_score", when(df_label.label == "Exact", 1.0)\
                                               .when(df_label.label == "Partial", 0.75)\
                                               .when(df_label.label == "Irrelevant", 0.0)\
                                               .when(df_label.label == "Exact", 1.0)\
                                               .otherwise(col("label_score")))

df_label.show(2)

+---+--------+----------+----------+-----------+
| id|query_id|product_id|     label|label_score|
+---+--------+----------+----------+-----------+
|  0|       0|     25434|     Exact|        1.0|
|  1|       0|     12088|Irrelevant|        0.0|
+---+--------+----------+----------+-----------+
only showing top 2 rows



# 4 Define Basic Search
- based on 02_Define_Basic_Searc.py


1. Create DF that joins product columns
2. Use sentence transformer to create embedding
3. Use langChain vector db connector to load
4. Analysis: correlation(cosine similarity, label score)

## 4.0 Load libraries

In [35]:
%pip install sentence-transformers langchain chromadb


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting langchain
  Downloading langchain-0.0.157-py3-none-any.whl (727 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m727.6/727.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting chromadb
  Downloading chromadb-0.3.21-py3-none-any.whl (46 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.8.4-cp39-cp39-macosx_10_9_x86_64.whl (360 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.3/360.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting dataclasses-json<0.6.0,>=0.5.7 (from la

Collecting monotonic>=1.5 (from posthog>=2.4.0->chromadb)
  Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)
Collecting backoff>=1.10.0 (from posthog>=2.4.0->chromadb)
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers>=2.2.2->chromadb)
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m[0m eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting torch>=1.6.0 (from sentence-transformers>=2.2.2->chromadb)
  Downloading torch-2.0.0-cp39-none-macosx_10_9_x86_64.whl (139.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:02[0m
[?25hCollecting torchvision (from sentence-transformers>=2.2.2->chromadb)
  Downloading torchvision-0.15.1-cp39-cp39-macosx_10_9_x86_64.whl 

[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.2/536.2 kB[0m [31m967.1 kB/s[0m eta [36m0:00:00[0m1m927.3 kB/s[0m eta [36m0:00:01[0m
[?25hBuilding wheels for collected packages: hnswlib, sentence-transformers
  Building wheel for hnswlib (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for hnswlib [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[17 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m running build_ext
  [31m   [0m creating var
  [31m   [0m creating var/folders
  [31m   [0m creating var/folders/gj
  [31m   [0m creating var/folders/gj/mkq_sr7n3l3bvwj7xpp9flwc0000gp
  [31m   [0m creating var/folders/gj/mkq_sr7n3l3bvwj7xpp9flwc0000gp/T
  [31m   [0m clang -Wno-unused-result -Wsign-compare -Wunreachable-code -fno-comm

In [6]:
from sentence_transformers import SentenceTransformer

from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

import mlflow

import pandas as pd
print('DONE')

DONE


## 4.1 Aggregate Data

In [31]:
product_text_pd = (
  spark
    .table('products')
    .selectExpr(
      'product_id',
      'product_name',
      'COALESCE(product_description, product_name) as product_text' # use product description if available, otherwise name
      )
  ).toPandas()

display(product_text_pd)

Unnamed: 0,product_id,product_name,product_text
0,0,solid wood platform bed,"good , deep sleep can be quite difficult to ha..."
1,1,all-clad 7 qt . slow cooker,"create delicious slow-cooked meals , from tend..."
2,2,all-clad electrics 6.5 qt . slow cooker,prepare home-cooked meals on any schedule with...
3,3,all-clad all professional tools pizza cutter,this original stainless tool was designed to c...
4,4,baldwin prestige alcott passage knob with roun...,the hardware has a rich heritage of delivering...
...,...,...,...
42989,42989,malibu pressure balanced diverter fixed shower...,the malibu pressure balanced diverter fixed sh...
42990,42990,emmeline 5 piece breakfast dining set,emmeline 5 piece breakfast dining set
42991,42991,maloney 3 piece pub table set,this pub table set includes 1 counter height t...
42992,42992,fletcher 27.5 '' wide polyester armchair,"bring iconic , modern style to your space in a..."


## 4.2 Convert product_text into embeddings
- We will now convert our product text into embeddings.  The instructions for converting text into an embedding is captured in a language model.  The [*all-MiniLM-L12-v2* model](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2) is a *mini language model* (in contrast to a large language model) which has been trained on a large, well-rounded corpus of input text for good, balanced performance in a variety of document search scenarios.  The benefit of the *mini* language model as compared to a *large* language is that the *mini* model generates a more succinct embedding structure that facilitates faster search and lower overall resource utilization.  Given the limited breadth of the content in a product catalog, this is the best option of our needs:

In [33]:
original_model = SentenceTransformer('all-MiniLM-L12-v2')


Downloading (…)5dded/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)4d81d5dded/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)81d5dded/config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ded/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5dded/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading (…)dded/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)4d81d5dded/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1d5dded/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [54]:
embedding_model_path = f"{config['WANDS_MODEL_PATH']}/embedding_model"
print(f'embedding_model_path={embedding_model_path}')

#!rm -rf {embedding_model_path}/*
#!mkdir -p {embedding_model_path}

original_model.save(embedding_model_path)


embedding_model_path=/Users/chang/Documents/dev/git/gratia/03_system/datapipeilne/scaling-machine-learning-course/db-product-search/work-dir/models/embedding_model


## 4.3 Ingest into Vector DB

#### Reload the orignal sentence transformer using langchain wrapper

In [55]:
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_path)


In [60]:
chromadb_path = f"{config['dbfs_path']}/chromadb"
!mkdir -p {chromadb_path}
chromadb_path

'/Users/chang/Documents/dev/git/gratia/03_system/datapipeilne/scaling-machine-learning-course/db-product-search/work-dir/wands/chromadb'

In [65]:
# Creat langcahin DataFrameLoader object
documents = (
  DataFrameLoader( #langchain.document_loaders.dataframe.DataFrameLoader
    product_text_pd,
    page_content_column='product_text'
    )
    .load()
  )

In [66]:
# Generate Embeddings from Product Info
# define logic for embeddings storage
vectordb = Chroma.from_documents(
  documents=documents, 
  embedding_model=embedding_model, 
  persist_directory=chromadb_path
  )




Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [67]:
# persist vector db to storage
# creates 2 parquet file; before we just had an index folder
vectordb.persist()

# DBTITLE 1,Count Items in Vector DB
vectordb._collection.count()

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

42994

In [68]:
# Examine a Vector DB record
rec= vectordb._collection.peek(1)

print('Metadatas:  ', rec['metadatas'])
print('Documents:  ', rec['documents'])
print('ids:        ', rec['ids'])
print('embeddings: ', rec['embeddings'])

Metadatas:   [{'product_id': 41976, 'product_name': 'bradleigh 2 - drawer end table'}]
Documents:   ['large countertops , clear storage : spacious countertops for everyday ornaments , cups , books , etc . thick and stable load plate : made of durable and durable plates , the load is more secure . firmly edge-sealing , carefully built : edge-sealing fit , not easy to lift , creating a rich home atmosphere . lifting table : creative design , change if you want , you can also use it as a desk , read books , play games , extraordinary experience . bring four wheels : move freely , save effort , and go wherever you want .']
ids:         ['c2d0edc4-e9d1-11ed-a6bb-1e00d20ee484']
embeddings:  [[-0.0447920486330986, 0.008601327426731586, -0.018501171842217445, 0.02558579109609127, -0.047929875552654266, -0.022731784731149673, 0.00840948149561882, 0.07353478670120239, 0.012087791226804256, 0.052916303277015686, -0.0822484940290451, -0.027594074606895447, -0.02226649969816208, 0.02781116403639316

In [69]:
# Perform Simple Search
vectordb.similarity_search_with_score("kid-proof rug")

[(Document(page_content="children 's nylon educational and play area rug .", metadata={'product_id': 42543, 'product_name': 'hartland abc barnyard power loomed green/red area rug'}),
  0.4518544673919678),
 (Document(page_content='this modern and convenient rug combines contemporary colors , an elegant medallion pattern , a wood floor safe backing , and durable construction . the kid and pet safe materials easy to spot clean , making these rugs ideal for a busy family .', metadata={'product_id': 41810, 'product_name': 'nile tarifa bohemian medallion red area rug'}),
  0.5379310846328735),
 (Document(page_content='this modern and convenient rug combines contemporary colors , an elegant medallion pattern , a wood floor safe backing , and durable construction . the kid and pet safe materials easy to spot clean , making these rugs ideal for a busy family .', metadata={'product_id': 41811, 'product_name': 'nile tarifa bohemian medallion yellow area rug'}),
  0.5379310846328735),
 (Document(

In [70]:
vectordb.similarity_search_with_score("bicyclist")

[(Document(page_content='decorative motorcycle', metadata={'product_id': 17819, 'product_name': 'decorative motorcycle'}),
  1.1318777799606323),
 (Document(page_content="minimalist design that 's almost invisible features : smart , simple cycle storage for all bike styles . ideal for all bikes : children 's , ladies ' , gents ' and tandems . versatile - store bikes horizontally in left or right orientations . facility to lock the bike in place . perfect for the home , office or retail display . make efficient use of space with a tiered installation . suitable for clipless or platform pedals . discreet wheel rests protect walls from dirt and damage .", metadata={'product_id': 8767, 'product_name': 'hero bicycle wall mount bike rack'}),
  1.1497584581375122),
 (Document(page_content="minimalist design that 's almost invisible features : smart , simple cycle storage for all bike styles versatile - store bikes horizontally in left or right orientations . perfect for the home , office or r

## 4.4 Deploy Model Via MLFlow

#### Define Environment Requirements


In [71]:
import pandas
import langchain
import chromadb

# get base environment configuration
conda_env = mlflow.pyfunc.get_default_conda_env()

# define packages required by model
packages = [
  f'pandas=={pandas.__version__}',
  f'langchain=={langchain.__version__}',
  f'chromadb=={chromadb.__version__}'
  ]

# add required packages to environment configuration
conda_env['dependencies'][-1]['pip'] += packages

print(
  conda_env
  )

{'name': 'mlflow-env', 'channels': ['conda-forge'], 'dependencies': ['python=3.8.3', 'pip<=23.1.2', {'pip': ['mlflow', 'cloudpickle==2.2.1', 'pandas==2.0.1', 'langchain==0.0.157', 'chromadb==0.3.21']}]}


#### Define artifacts
1. Artifacts are assets stored with the model as it is logged with MLflow.  Using keys assigned to these artifacts, those assets can be retrieved for utilization at various points in the model's logic. 

2. The two artifacts needed for our model are the path to the saved model and the Chroma database, both of which were persisted to storage in previous steps.  Please note that these objects were saved to the *Databricks Filesystem* which MLflow understands how to reference.  As a result, we need to alter the paths to these items by replacing the local */dbfs* to *dbfs:*

In [72]:
#Identify Model Artifacts
artifacts = {
  'embedding_model': embedding_model_path,#.replace('/dbfs','dbfs:'), 
  'chromadb': chromadb_path#.replace('/dbfs','dbfs:')
  }

print(
  artifacts
  )

{'embedding_model': '/Users/chang/Documents/dev/git/gratia/03_system/datapipeilne/scaling-machine-learning-course/db-product-search/work-dir/models/embedding_model', 'chromadb': '/Users/chang/Documents/dev/git/gratia/03_system/datapipeilne/scaling-machine-learning-course/db-product-search/work-dir/wands/chromadb'}


#### Define Model Wrapper class
- In the Databricks environment, deployment typically takes place using [MLflow](https://www.databricks.com/product/managed-mlflow), which has the ability to build a containerized service from our model as one of its deployment patterns.  Generic Python models deployed with MLflow typically support a standard API with a *predict* method that's called for inference.  We will need to write a custom wrapper to map a standard interface to our model as follows:

In [73]:
class ProductSearchWrapper(mlflow.pyfunc.PythonModel):

  # define steps to initialize model
  # addresses the steps that need to take place at model initialization. Two of those steps make reference to artifacts within the model's context.
  def load_context(self, context):

    # import required libraries
    import pandas as pd
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import Chroma

    # retrieve embedding model
    embedding_model = HuggingFaceEmbeddings(model_name=context.artifacts['embedding_model'])

    # retrieve vectordb contents
    self._vectordb = Chroma(
      persist_directory=context.artifacts['chromadb'],
      embedding_function=embedding_model
      )

    # set number of results to return
    self._max_results = 5


  # define steps to generate results
  # note: query_df expects only one query
  def predict(self, context, query_df):

    # import required libraries
    import pandas as pd 

    # perform search on embeddings
    raw_results = self._vectordb.similarity_search_with_score(
      query_df['query'].values[0], # only expecting one value at a time 
      k=self._max_results
      )

    # get lists of of scores, descriptions and ids from raw results
    scores, descriptions, names, ids = zip(
      *[(r[1], r[0].page_content, r[0].metadata['product_name'], r[0].metadata['product_id']) for r in raw_results]
      )

    # reorganized results as a pandas df, sorted on score
    results_pd = pd.DataFrame({
      'product_id':ids,
      'product_name':names,
      'product_description':descriptions,
      'score':scores
      }).sort_values(axis=0, by='score', ascending=True)
    
    # set return value
    return results_pd


#### Persist Model to MLFlow
1. Notice that in this scenario, our embedding model and Chroma database are being loaded as artifacts and that our *python_model* is just the class definition that provides the logic for hydrating a model from those artifacts:

2. If we use the experiments UI (accessible by clicking the flask icon in the right-hand navigation of your workspace), we can access the details surrounding the model we just logged.  By expanding the folder structure behind the model, we can see the model and vector store assets loaded into MLflow:



In [78]:
# serialize the model to mlruns/models/version_*
#.   each time I run the cmd below, it will create a new wands_basic_search versopm
with mlflow.start_run() as run:

    mlflow.pyfunc.log_model(
        artifact_path='model',
        python_model=ProductSearchWrapper(),
        conda_env=conda_env,
        artifacts=artifacts, # items at artifact path will be loaded into mlflow repository
        registered_model_name=config['basic_model_name']
    )

Registered model 'wands_basic_search' already exists. Creating a new version of this model...
2023/05/03 10:34:58 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: wands_basic_search, version 3
Created version '3' of model 'wands_basic_search'.


In [55]:
run.info

<RunInfo: artifact_uri='file:///Users/thomaschang/git/thomaschangsf/db-product-search/mlruns/126008453708962408/a666e11f719349ea95b39ed252cd4f6e/artifacts', end_time=None, experiment_id='126008453708962408', lifecycle_stage='active', run_id='a666e11f719349ea95b39ed252cd4f6e', run_name='dashing-ray-168', run_uuid='a666e11f719349ea95b39ed252cd4f6e', start_time=1683321621993, status='RUNNING', user_id='thomaschang'>

#### Elevate to production
- In mlruns/models/wands_basic_search/version-*/meta.yaml. 
```
    current_stage: None | Production
```
    * the models directory appears to be the "model" repository
- The next command will update this meta.yaml file's stage to production

- Loading our model, we can perform a simple test to see results from a sample search. 



In [94]:
client = mlflow.MlflowClient()

latest_version = client.get_latest_versions(config['basic_model_name'], stages=['None'])[0].version

client.transition_model_version_stage(
    name=config['basic_model_name'],
    version=latest_version,
    stage='Production',
    archive_existing_versions=True
)


<ModelVersion: aliases=[], creation_timestamp=1683135298551, current_stage='Production', description=None, last_updated_timestamp=1683136187082, name='wands_basic_search', run_id='4602daae68fc4f7fa4a86a0397f40dcf', run_link=None, source='file:///Users/chang/Documents/dev/git/gratia/03_system/datapipeilne/scaling-machine-learning-course/db-product-search/mlruns/625849527336453425/4602daae68fc4f7fa4a86a0397f40dcf/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

#### Retrieve model from registry


In [95]:
# appears load_model has a context directory of your current experiment, probably set by mlflow.pyfunc.log_model
model = mlflow.pyfunc.load_model(f"models:/{config['basic_model_name']}/Production")




In [96]:
# Test Persisted Model with Sample Search
search = pd.DataFrame({'query':['farmhouse dining room table']})

# call model
display(model.predict(search))

Unnamed: 0,product_id,product_name,product_description,score
0,14562,rustic dining table,rustic dining table,1.022571
1,14783,industrial solid wood dining table,gather family and friends for good food and go...,1.030153
2,22297,norman dining table,norman dining table,1.064149
3,23646,lockard extendable dining table,anchor your dining room in modern farmhouse st...,1.092422
4,13935,marceline 40 '' console table,farmhouse inspired design will add a charming ...,1.094104


# 5 Fine Tune
- Based on 03_Fine_Tune_Model.py
- Having demonstrated the basics of assembling a model and supporting data to enable a semantic search, we will now focus on fine-tuning the model.  During fine-tuning, the model is fit against a set of data specific to a particular domain, such as our product catalog.  The original knowledge accumulated by our model from its pre-training remains intact but is supplemented with information gleaned from the additional data provided.  Once the model has been tuned to our satisfaction, it is packaged and persisted just like as before.

In [43]:
%pip install langchain

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting langchain
  Downloading langchain-0.0.157-py3-none-any.whl (727 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m727.6/727.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.8.4-cp39-cp39-macosx_10_9_x86_64.whl (360 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.3/360.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)
Collecting numexpr<3.0.0,>=2.8.4 (from langchain)
  Downloading numexpr-2.8.4-cp39-cp39-macosx_10_9_x86_64.whl (99 kB)
[2K     [38;2;114;156;

In [15]:
from sentence_transformers import SentenceTransformer, util, InputExample, losses, evaluation
import torch
from torch.utils.data import DataLoader

from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

import numpy as np
import pandas as pd

import mlflow

## 5.1 Aggregate product, labels, and query

In [16]:
search_pd = (
  spark   
    .table('products')
    .selectExpr(
      'product_id',
      'product_name',
      'COALESCE(product_description, product_name) as product_text' # use product description if available, otherwise name
      )
    .join(
        df_label, #spark.table('labels'),
        on='product_id'
      )
    .join(
      spark
        .table('queries'),
        on='query_id'
      )
      .selectExpr('query','product_text','label_score as score')
  ).toPandas()

display(search_pd)

Unnamed: 0,query,product_text,score
0,parsons chairs,the executive chair features stylish chrome ac...,0.75
1,ergonomic chair,the executive chair features stylish chrome ac...,1.00
2,teal chair,the executive chair features stylish chrome ac...,0.75
3,togo chair,the executive chair features stylish chrome ac...,0.75
4,ruckus chair,the executive chair features stylish chrome ac...,0.00
...,...,...,...
233443,wood bar stools,this set of two barstools features a minimalis...,0.75
233444,wine bar,this set of two barstools features a minimalis...,0.00
233445,bar stool 24 inches height,this set of two barstools features a minimalis...,0.75
233446,bar stool with backrest,this set of two barstools features a minimalis...,0.75


## 5.2 Embed product, query, and label

In [17]:
original_model = SentenceTransformer('all-MiniLM-L12-v2')




In [47]:
query_embeddings = (
  original_model
    .encode(
      search_pd['query'].tolist()
      )
  )
print('DONE')

DONE


In [48]:
product_embeddings = (
  original_model
    .encode(
      search_pd['product_text'].tolist()
      )
  )
print('DONE')

DONE


#### Calculate cosine similarity between Queries and Products

In [50]:
original_cos_sim_scores = (
  util.pairwise_cos_sim(
    query_embeddings, 
    product_embeddings
    )
  )
print(original_cos_sim_scores)


tensor([0.3665, 0.6709, 0.4955,  ..., 0.5241, 0.5641, 0.6809])


#### Analyze correlation between cosine similarity and human relevancy score

In [51]:
original_corr_coef_score = (
  np.corrcoef(
    original_cos_sim_scores,
    search_pd['score'].values
  )[0][1]
) 
# print results
print(original_corr_coef_score)

0.432595382706685


## 5.3 Fine Tune Model
- With a baseline measurement of the original model's performance in-hand, we can now fine-tune it using our annotated search result data.  We will start by restructuring our query results into a list of inputs as required by the model:

#### Restructure Data for Model Input

In [24]:
# 233448
search_pd.count

<bound method DataFrame.count of                                query   
0                     parsons chairs  \
1                    ergonomic chair   
2                         teal chair   
3                         togo chair   
4                       ruckus chair   
...                              ...   
233443               wood bar stools   
233444                      wine bar   
233445    bar stool 24 inches height   
233446       bar stool with backrest   
233447  overstreet rustic pub stools   

                                             product_text  score  
0       the executive chair features stylish chrome ac...   0.75  
1       the executive chair features stylish chrome ac...   1.00  
2       the executive chair features stylish chrome ac...   0.75  
3       the executive chair features stylish chrome ac...   0.75  
4       the executive chair features stylish chrome ac...   0.00  
...                                                   ...    ...  
233443  this set 

In [26]:
search_pd_small = search_pd.sample(2334)
search_pd_small.count

<bound method DataFrame.count of                                            query   
36139                          bathroom lighting  \
125119                      home sweet home sign   
41237                                 card table   
26554                             entrance table   
67036                            turquoise chair   
...                                          ...   
163066  48 inch bathroom vanity with trough sink   
125658                   cloud modular sectional   
9235                               rooster decor   
195967                 memory foam rug galveston   
166549                             hardwood beds   

                                             product_text  score  
36139   lenora 's duke 's collection conveys the elega...   0.00  
125119  this welcome sign for the front door will add ...   0.75  
41237   linearity and comfort are in no way opposites ...   0.75  
26554   anchor your seating ensemble with this contemp...   0.00  
67036  

In [28]:
# define function to assemble an input
def create_input(doc1, doc2, score):
  return InputExample(texts=[doc1, doc2], label=score)

# convert each search result into an input
# !!! Use smaller dataset; not search_pd
inputs = search_pd_small.apply(
  lambda s: create_input(s['query'], s['product_text'], s['score']), axis=1
  ).to_list()

inputs

print('DONE')

DONE


#### Train and tune a new model
- During model fitting, you will notice we are setting the model to perform just one pass (epoch) over the data.  We will actually see pretty sizeable improvements from this process, but we may wish to increase that value to get multiple passes if we want to explore getting more.  The setting for *warmup_steps* is just a common one used in this space.  Feel free to experiment with other values or take the default.

In [30]:
tuned_model = SentenceTransformer('all-MiniLM-L12-v2')

# define instructions for feeding inputs to model
input_dataloader = DataLoader(inputs, shuffle=True, batch_size=16) # feed 16 records at a time to the model

# define loss metric to optimize for
loss = losses.CosineSimilarityLoss(tuned_model)

# tune the model on the input data
tuned_model.fit(
  train_objectives=[(input_dataloader, loss)],
  epochs=1, # just make 1 pass over data
  warmup_steps=5 # 100 orig; controls how many steps over which learning rate increases to max before descending back to zero
  )


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/146 [00:00<?, ?it/s]

#### Estimate Tune Model Performance

In [31]:
query_embeddings = (
  tuned_model
    .encode(
      search_pd['query'].tolist()
      )
  )
print('DONE')

DONE


In [32]:
product_embeddings = (
  tuned_model
    .encode(
      search_pd['product_text'].tolist()
      )
  )
print('DONE')


DONE


In [33]:


# determine cosine similarity for each query-product pair
tuned_cos_sim_scores = (
  util.pairwise_cos_sim(
    query_embeddings, 
    product_embeddings
    )
  )

tuned_cos_sim_score = torch.mean(tuned_cos_sim_scores).item()

# display result
print(f"With tuning, avg cosine similarity went from {original_cos_sim_score} to {tuned_cos_sim_score}")


NameError: name 'original_cos_sim_score' is not defined

## 5.4 Deploy Fine Tuned Model

### 5.4.1 Ingest into Vector DB

#### Serialize fined tuned model

In [34]:
# encoder path
embedding_model_path = f"/{config['WANDS_MODEL_PATH']}/tuned_model"
print(f'embedding_model_path={embedding_model_path}')

# make sure path is clear
# !rm -rf {embedding_model_path}
# !mkdir -p {embedding_model_path}

# reload model using langchain wrapper
tuned_model.save(embedding_model_path)
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_path)


embedding_model_path=//Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/models/tuned_model


#### Get Product Text to Search

In [35]:
product_text_pd = (
  spark
    .table('products')
    .selectExpr(
      'product_id',
      'product_name',
      'COALESCE(product_description, product_name) as product_text' # use product description if available, otherwise name
      )
  ).toPandas()

product_text_pd.count

#### Persist product vector to Chroma DB

##### Side note on chromadb and mac intel x86 architectures
- install chromadb on mac with intel 86 architectures
	- chromadb uses hnswlib, which is built natively to support mac arm architectures
	- Since my work laptop is the x86 intel architecture, the following commands are necessary 
```
terminal: arch -x86_64 zsh
export HNSWLIB_NO_NATIVE=1
venv/bin/install chromadb
```

In [41]:
# assemble product documents in required format (id, text)
documents = (
  DataFrameLoader(
    product_text_pd,
    page_content_column='product_text'
    )
    .load()
  )
documents

[Document(page_content='the executive chair features stylish chrome accents and black faux leather trimming that you can sit , work and relax in comfort . stylish chrome arms and a rolling base put forward a professional , crisp air of sophistication . adjust the height , tilt , and swivel for ultimate personalization of your comfortable work-space . seat and back tilt-angles are adjustable , with tension control and tilt lock . the seat swivels 360 degrees . the padded back and seat of this executive chair provide built-in lower back and head support .', metadata={'product_id': 5555, 'product_name': 'amiracle executive chair'}),
 Document(page_content='the executive chair features stylish chrome accents and black faux leather trimming that you can sit , work and relax in comfort . stylish chrome arms and a rolling base put forward a professional , crisp air of sophistication . adjust the height , tilt , and swivel for ultimate personalization of your comfortable work-space . seat and 

In [37]:
chromadb_path = f"{config['dbfs_path']}/chromadb"
!mkdir -p {chromadb_path}
chromadb_path

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/wands/chromadb'

In [44]:
import langchain
# generate embeddings
vectordb = Chroma.from_documents(
  documents=documents, 
  embedding_model=embedding_model, 
  persist_directory=chromadb_path
  )

# persist vector db to chromadb_path
vectordb.persist()

print('DONE')



Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

DONE


#### Examine a Vector DB record

In [45]:
rec = vectordb._collection.peek()

print('Document:    ', rec['documents'])
print('ids:    ', rec['ids'])
print('embeddings:    ', rec['embeddings'])


Document:     ['with its sought-after design , our lounge chair in dyed italian tanned leather upholstery lends classic looks , crafting , and quality to your well-appointed living space . the buttery full-aniline ( cognac tan ) leather features scuffs and authentic , natural hide markings that patina beautifully over time-while the rich semi-aniline ( onyx black ) leather embodies softness and ease . pirelli webbing supports high-density foam seat cushions layered in a super soft feather down topper . built for durability , the wood frame is reinforced with corner blocking , and its solid wood legs in mahogany finish draw the eye . while some trends come and go , this deep chair ’ s handsome tufted seat back and arms convey a timeless quality you ’ ll adore always .', 'the zline is a 36 in . designer series wooden wall mount range hood with a modern design and built-to-last quality that would make it a great addition to any home or kitchen remodel . the gray hand-finished wood is made

#### Perform similarity search

In [47]:
vectordb.similarity_search_with_score('bicyclist')

[(Document(page_content='decorative motorcycle', metadata={'product_id': 17819, 'product_name': 'decorative motorcycle'}),
  1.1318776607513428),
 (Document(page_content="minimalist design that 's almost invisible features : smart , simple cycle storage for all bike styles . ideal for all bikes : children 's , ladies ' , gents ' and tandems . versatile - store bikes horizontally in left or right orientations . facility to lock the bike in place . perfect for the home , office or retail display . make efficient use of space with a tiered installation . suitable for clipless or platform pedals . discreet wheel rests protect walls from dirt and damage .", metadata={'product_id': 8767, 'product_name': 'hero bicycle wall mount bike rack'}),
  1.1497584581375122),
 (Document(page_content="minimalist design that 's almost invisible features : smart , simple cycle storage for all bike styles versatile - store bikes horizontally in left or right orientations . perfect for the home , office or r

### 5.4.2 Deploy Model via MLFLOW

#### Define wrapper class for model

In [48]:
class ProductSearchWrapper(mlflow.pyfunc.PythonModel):


  # define steps to initialize model
  def load_context(self, context):

    # import required libraries
    import pandas as pd
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import Chroma

    # retrieve embedding model
    embedding_model = HuggingFaceEmbeddings(model_name=context.artifacts['embedding_model'])

    # retrieve vectordb contents
    self._vectordb = Chroma(
      persist_directory=context.artifacts['chromadb'],
      embedding_function=embedding_model
      )

    # set number of results to return
    self._max_results = 5


  # define steps to generate results
  # note: query_df expects only one query
  def predict(self, context, query_df):


    # import required libraries
    import pandas as pd

    # perform search on embeddings
    raw_results = self._vectordb.similarity_search_with_score(
      query_df['query'].values[0], # only expecting one value at a time 
      k=self._max_results
      )

    # get lists of of scores, descriptions and ids from raw results
    scores, descriptions, names, ids = zip(
      *[(r[1], r[0].page_content, r[0].metadata['product_name'], r[0].metadata['product_id']) for r in raw_results]
      )

    # reorganized results as a pandas df, sorted on score
    results_pd = pd.DataFrame({
      'product_id':ids,
      'product_name':names,
      'product_description':descriptions,
      'score':scores
      }).sort_values(axis=0, by='score', ascending=True)
    
    # set return value
    return results_pd

#### Define Artifacts and Environments

In [49]:
# ---------------------------------------
# Define artifacts
# ---------------------------------------
artifacts = {
  'embedding_model': embedding_model_path, 
  'chromadb': chromadb_path
}
print(f"artifacts={artifacts}")


# ---------------------------------------
# Define Environment Requirements
# ---------------------------------------
import pandas
import langchain
import chromadb

# get base environment configuration
conda_env = mlflow.pyfunc.get_default_conda_env()

# define packages required by model
packages = [
  f'pandas=={pandas.__version__}',
  f'langchain=={langchain.__version__}',
  f'chromadb=={chromadb.__version__}'
  ]

# add required packages to environment configuration
conda_env['dependencies'][-1]['pip'] += packages

print(f"conda_env={conda_env}")


artifacts={'embedding_model': '//Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/models/tuned_model', 'chromadb': '/Users/thomaschang/git/thomaschangsf/db-product-search/work-dir/wands/chromadb'}
conda_env={'name': 'mlflow-env', 'channels': ['conda-forge'], 'dependencies': ['python=3.9.6', 'pip<=23.1.2', {'pip': ['mlflow', 'cloudpickle==2.2.1', 'pandas==2.0.1', 'langchain==0.0.157', 'chromadb==0.3.21']}]}


#### Persist model

In [50]:
with mlflow.start_run() as run:

    mlflow.pyfunc.log_model(
        artifact_path='model', 
        python_model=ProductSearchWrapper(),
        conda_env=conda_env,
        artifacts=artifacts, # items at artifact path will be loaded into mlflow repository
        registered_model_name=config['tuned_model_name']
    )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Successfully registered model 'wands_tuned_search'.
2023/05/05 14:20:22 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: wands_tuned_search, version 1
Created version '1' of model 'wands_tuned_search'.


#### Elevate model to Production

In [51]:
client = mlflow.MlflowClient()

latest_version = client.get_latest_versions(config['tuned_model_name'], stages=['None'])[0].version

client.transition_model_version_stage(
    name=config['tuned_model_name'],
    version=latest_version,
    stage='Production',
    archive_existing_versions=True
)


<ModelVersion: aliases=[], creation_timestamp=1683321622872, current_stage='Production', description=None, last_updated_timestamp=1683321630156, name='wands_tuned_search', run_id='a666e11f719349ea95b39ed252cd4f6e', run_link=None, source='file:///Users/thomaschang/git/thomaschangsf/db-product-search/mlruns/126008453708962408/a666e11f719349ea95b39ed252cd4f6e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

#### Retrieve model from registry


In [52]:
model = mlflow.pyfunc.load_model(f"models:/{config['tuned_model_name']}/Production")



#### Smoke test

In [54]:
search = pd.DataFrame({'query':['farmhouse dining room table']})

display(model.predict(search))

Unnamed: 0,product_id,product_name,product_description,score
0,42783,fuller solid wood 3 legs end table,an approachable contemporary take on a minimal...,1.162406
1,42384,frankie dining table,bring curated contemporary style to your dinin...,1.165267
2,23688,abbas buffet table,combining midcentury modern forms with rustic ...,1.165399
3,2583,chalus extendable dining table,walking into the dining room that is occupied ...,1.170633
4,32147,leamont dining table,anchor your dining space in cottage style with...,1.177125
