## upload2milvus

In [13]:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer, RobertaPreTrainedModel
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.preprocessing import normalize
import random
import numpy as np
from itertools import chain
from conf.dao_config import simcse_recipe_ingredients_dir,simcse_recipe_title_dir,MILVUS_HOST

In [3]:
# Load the recipe table; later cells read its "ingredients" and "title" columns.
recipes = pd.read_csv(filepath_or_buffer="recipe/recipe.csv")

### upload ingredients vector

In [5]:
# Wrap only the "ingredients" column in a Dataset so it can be batched.
data = Dataset.from_pandas(recipes.loc[:, ["ingredients"]])

In [6]:
# Batches of 32 rows; shuffling stays off (the default) so row order is kept.
dataloader = DataLoader(dataset=data, batch_size=32, shuffle=False)

In [8]:
# Load the encoder and its tokenizer from the same ingredients checkpoint directory.
model = AutoModel.from_pretrained(simcse_recipe_ingredients_dir)
tokenizer = AutoTokenizer.from_pretrained(simcse_recipe_ingredients_dir)
# Switch to evaluation mode; the returned module is displayed as the cell output.
model.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [None]:
# Encode every batch of ingredient strings without tracking gradients and
# collect one pooled-output tensor per batch.
embedding = []
with torch.no_grad():
    for batch in tqdm(dataloader):
        encoded = tokenizer(
            batch["ingredients"],
            padding="max_length",
            max_length=128,
            truncation=True,
            return_tensors="pt",
        )
        pooled = model(**encoded).pooler_output
        embedding.append(pooled)

In [16]:
# Split each per-batch tensor into its rows and convert every row to a NumPy array.
embedding = [row.detach().numpy() for batch in embedding for row in batch]

In [14]:
# Register the "default" connection alias against the configured Milvus host.
connections.connect(alias="default", host=MILVUS_HOST, port="19530")

In [12]:
# Remove any existing "COMP9900" collection so the cells below recreate it.
# drop_collection is reached through pymilvus.utility; the bare name is never
# imported in this notebook and would raise NameError.
if utility.has_collection("COMP9900"):
    utility.drop_collection("COMP9900", timeout=None, using='default')

True

In [12]:
# Two-field schema: an INT64 primary key plus a 768-dimensional float vector.
fields = [
    FieldSchema(
        name="index",
        dtype=DataType.INT64,
        is_primary=True,
    ),
    FieldSchema(
        name="embeddings",
        dtype=DataType.FLOAT_VECTOR,
        dim=768,
    ),
]

# The second CollectionSchema argument is the schema description.
schema = CollectionSchema(fields, description="COMP9900")
hello_milvus = Collection(name="COMP9900", schema=schema)

In [15]:
# Display the collections visible on the "default" connection.
utility.list_collections(using="default")

['recipe_title_search', 'recipe_image_search', 'COMP9900']

In [None]:
# Column-ordered payload: 1-based primary keys first, then the vectors.
primary_keys = list(range(1, len(embedding) + 1))
entity = [primary_keys, embedding]

In [None]:
# Send the key and vector columns to the collection.
insert_result = hello_milvus.insert(data=entity)

In [None]:
# IVF_FLAT index over the "embeddings" field, using the inner-product metric.
index_params = {
  "metric_type":"IP",
  "index_type":"IVF_FLAT",
  "params":{"nlist":1024}
}

# The collection handle in this notebook is `hello_milvus`; no variable named
# `collection` is defined, so calling through it would raise NameError.
hello_milvus.create_index(
  field_name="embeddings", 
  index_params=index_params
)

Status(code=0, message='')

### upload title vector

In [16]:
# Wrap only the "title" column in a Dataset so it can be batched.
data = Dataset.from_pandas(recipes.loc[:, ["title"]])

In [17]:
# Batches of 32 titles; shuffling stays off (the default) so row order is kept.
dataloader = DataLoader(dataset=data, batch_size=32, shuffle=False)

In [18]:
# Load the encoder and its tokenizer from the same title checkpoint directory.
model = AutoModel.from_pretrained(simcse_recipe_title_dir)
tokenizer = AutoTokenizer.from_pretrained(simcse_recipe_title_dir)
# Switch to evaluation mode; the returned module is displayed as the cell output.
model.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [None]:
# Encode every batch of titles without tracking gradients and collect one
# pooled-output tensor per batch.
embedding = []
with torch.no_grad():
    for batch in tqdm(dataloader):
        encoded = tokenizer(
            batch["title"],
            padding="max_length",
            max_length=128,
            truncation=True,
            return_tensors="pt",
        )
        pooled = model(**encoded).pooler_output
        embedding.append(pooled)

  0%|          | 5/1023 [00:08<29:38,  1.75s/it]

In [16]:
# Split each per-batch tensor into its rows and convert every row to a NumPy array.
embedding = [row.detach().numpy() for batch in embedding for row in batch]

In [14]:
# Register the "default" connection alias against the configured Milvus host.
connections.connect(alias="default", host=MILVUS_HOST, port="19530")

In [12]:
# Remove any existing "recipe_title_search" collection so the cells below
# recreate it. drop_collection is reached through pymilvus.utility; the bare
# name is never imported in this notebook and would raise NameError.
if utility.has_collection("recipe_title_search"):
    utility.drop_collection("recipe_title_search", timeout=None, using='default')

True

In [12]:
# Two-field schema: an INT64 primary key plus a 768-dimensional float vector.
fields = [
    FieldSchema(
        name="index",
        dtype=DataType.INT64,
        is_primary=True,
    ),
    FieldSchema(
        name="embeddings",
        dtype=DataType.FLOAT_VECTOR,
        dim=768,
    ),
]

# The second CollectionSchema argument is the schema description.
schema = CollectionSchema(fields, description="recipe_title_search")
hello_milvus = Collection(name="recipe_title_search", schema=schema)

In [15]:
# Display the collections visible on the "default" connection.
utility.list_collections(using="default")

['recipe_title_search', 'recipe_image_search', 'COMP9900']

In [None]:
# Column-ordered payload: 1-based primary keys first, then the vectors.
primary_keys = list(range(1, len(embedding) + 1))
entity = [primary_keys, embedding]

In [None]:
# Send the key and vector columns to the collection.
insert_result = hello_milvus.insert(data=entity)

In [None]:
# IVF_FLAT index over the "embeddings" field, using the inner-product metric.
index_params = {
  "metric_type":"IP",
  "index_type":"IVF_FLAT",
  "params":{"nlist":1024}
}

# The collection handle in this notebook is `hello_milvus`; no variable named
# `collection` is defined, so calling through it would raise NameError.
hello_milvus.create_index(
  field_name="embeddings", 
  index_params=index_params
)

Status(code=0, message='')

### upload CLIP title vector

In [1]:
import clip
from PIL import Image
import requests
from io import BytesIO
import pandas as pd
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer, RobertaPreTrainedModel
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.preprocessing import normalize
import random
import numpy as np
from itertools import chain

In [33]:
# Load the ViT-B/32 CLIP checkpoint. `preprocess` is returned alongside the
# model but is not used by the cells shown in this section.
model, preprocess = clip.load(name="ViT-B/32")

100%|███████████████████████████████████████| 338M/338M [01:25<00:00, 4.13MiB/s]


In [None]:
# Wrap only the "title" column in a Dataset so it can be batched.
data = Dataset.from_pandas(recipes.loc[:, ["title"]])

In [35]:
# Batches of 64 titles; shuffling stays off (the default) so row order is kept.
dataloader = DataLoader(dataset=data, batch_size=64, shuffle=False)

In [39]:
# `model` is the CLIP model returned by clip.load; no `text_model` name exists
# in this notebook. Call eval() (with parentheses) so the mode is actually set.
model.eval()
embedding = []

# Tokenize each batch of titles and collect one text-embedding tensor per batch,
# without tracking gradients.
with torch.no_grad():
    for dataset in tqdm(dataloader):
        inputs = clip.tokenize(dataset['title'])
        data_embedding = model.encode_text(inputs)
        embedding.append(data_embedding)

100%|██████████| 512/512 [07:22<00:00,  1.16it/s]


In [40]:
# Split each per-batch tensor into its rows and convert every row to a NumPy array.
embedding = [row.detach().numpy() for batch in embedding for row in batch]

In [41]:
# Use the configured MILVUS_HOST (imported from conf.dao_config at the top of
# the notebook) instead of a hard-coded server address, matching the other
# upload sections.
connections.connect("default",host=MILVUS_HOST, port="19530")

In [42]:
# Remove any existing "recipe_image_search" collection so it is recreated below.
if utility.has_collection(collection_name="recipe_image_search"):
    utility.drop_collection(collection_name="recipe_image_search")

True

In [45]:
# Two-field schema: an INT64 primary key plus a float vector whose dimension
# is read from the first embedding.
fields = [
    FieldSchema(
        name="index",
        dtype=DataType.INT64,
        is_primary=True,
    ),
    FieldSchema(
        name="embeddings",
        dtype=DataType.FLOAT_VECTOR,
        dim=embedding[0].shape[0],
    ),
]

# The second CollectionSchema argument is the schema description.
schema = CollectionSchema(fields, description="recipe_image_search")
hello_milvus = Collection(name="recipe_image_search", schema=schema)

In [48]:
# IVF_FLAT index over the "embeddings" field, using the inner-product metric.
index_params = dict(
    metric_type="IP",
    index_type="IVF_FLAT",
    params=dict(nlist=1024),
)

# Last expression of the cell, so the returned status is displayed.
hello_milvus.create_index(field_name="embeddings", index_params=index_params)

Status(code=0, message='')

In [108]:
# Re-create the handle for "recipe_image_search" with the same schema object.
hello_milvus = Collection(name="recipe_image_search", schema=schema)

In [46]:
# Column-ordered payload: 1-based primary keys first, then the vectors.
primary_keys = list(range(1, len(embedding) + 1))
entity = [primary_keys, embedding]

In [47]:
# Send the key and vector columns to the collection.
insert_result = hello_milvus.insert(data=entity)