<a href="https://colab.research.google.com/github/vishalmysore/vectorx/blob/main/notebook/myscale_cookgpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install clickhouse-connect openai sentence-transformers torch requests pandas tqdm

In [None]:
!pip install datasets

In [None]:
import torch
from sentence_transformers import SentenceTransformer
# run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# instantiate the sentence-embedding retriever on the chosen device
retriever = SentenceTransformer(
    'all-minilm-l6-v2',
    device=device,
)

In [None]:
from datasets import load_dataset

# Load the dataset by its repository id.
dataset = load_dataset(path="VishalMysore/newIndianCuisine")

In [None]:
import pandas as pd
# `dataset` was already loaded by the previous cell, so it is not fetched a second time here.
# Materialize the 'train' split as a pandas DataFrame.
data_raw = pd.DataFrame(dataset['train'])
# Summarize the raw DataFrame. DataFrame.info() prints its report itself and
# returns None, so wrapping it in print() would only add a stray "None" line.
data_raw.info()

In [None]:
from tqdm.auto import tqdm

# Texts to embed: every value of the 'Method' column, as a plain Python list.
summary_raw = data_raw['Method'].values.tolist()
method_feature = []

# Number of texts handed to the encoder per call. A value of 1 reproduces the
# original one-text-at-a-time behaviour; larger batches make fewer encoder calls.
EMBED_BATCH_SIZE = 32

for i in tqdm(range(0, len(summary_raw), EMBED_BATCH_SIZE)):
    # clamp the slice end so the final (possibly shorter) batch stays in range
    i_end = min(i + EMBED_BATCH_SIZE, len(summary_raw))
    # encode the whole batch; the result converts to one embedding (list of floats) per text
    batch_emb = retriever.encode(summary_raw[i:i_end]).tolist()
    # keep every embedding of the batch, preserving row order
    method_feature.extend(batch_emb)

# Attach the embeddings as a new column, aligned row-by-row with the source texts.
data_raw['method_feature'] = method_feature

In [None]:
import clickhouse_connect
from google.colab import userdata

  # initialize client
client = clickhouse_connect.get_client(
    # cluster endpoint and port
    host='msc-8cdd15a4.us-east-1.aws.myscale.com',
    port=443,
    # account name; the password is read from the Colab secret named 'myscale'
    username='vishalmysore_org_default',
    password=userdata.get('myscale'),
)

In [None]:
# Recreate the recipe table: drop any existing copy first, then define it again.
client.command(cmd="DROP TABLE IF EXISTS default.myscale_cookgpt")

# method_feature holds the embedding of each row; the CHECK constraint
# requires every stored vector to have exactly 384 elements.
client.command(cmd="""
CREATE TABLE default.myscale_cookgpt
(
    id UInt64,
    Recipe String,
    "Total Time" String,
    Method String,
    Category String,
    Ingredients String,
    method_feature Array(Float32),
    CONSTRAINT vector_len CHECK length(method_feature) = 384
)
ORDER BY id
""")

In [None]:
# Upload the DataFrame: rows are converted to a list of tuples (index excluded)
# and the DataFrame's column labels are passed as the target column names.
client.insert(
    table="default.myscale_cookgpt",
    data=data_raw.to_records(index=False).tolist(),
    column_names=data_raw.columns.tolist(),
)

In [None]:
# Add a vector index named method_feature_index on the embedding column
# (index type MSTG, cosine metric).
client.command(cmd="""
ALTER TABLE default.myscale_cookgpt
ADD VECTOR INDEX method_feature_index method_feature
TYPE MSTG
('metric_type=Cosine')
""")

In [None]:
# Look up the status recorded for the vector index in system.vector_indices.
get_index_status = (
    "SELECT status FROM system.vector_indices "
    "WHERE name='method_feature_index'"
)
print(f"index build status: {client.command(get_index_status)}")

In [None]:
# Natural-language question to search with.
question = 'what recipe is made with Paneer?'
# Embed the question with the same retriever used for the stored texts,
# converted to a plain Python list.
emb_query = retriever.encode(sentences=question).tolist()

In [None]:
# Number of nearest rows to return.
top_k = 2
# Rank rows by distance between the stored embedding and the query embedding,
# keeping the top_k closest ones.
results = client.query(query=f"""
SELECT Recipe, Method, distance(method_feature, {emb_query}) as dist
FROM default.myscale_cookgpt

ORDER BY dist LIMIT {top_k}
""")

# Collect (Recipe, Method) pairs from the result rows, in ranked order.
summaries = [
    (row["Recipe"], row["Method"])
    for row in results.named_results()
]

print(summaries)