In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import scipy.stats
import redis
from redis.commands.search.query import Query



In [2]:
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Item from 3 session nearest and its weight (save in database)
q = 0.2
v = 0.3
r = 0.5
itemSessionVector = [([0.1,0.25,0.13], q),([0.9,-0.2,0.125], q), ([0.5, -0.6, 0.254], v), ([0.1, 0.36, -0.9], r)]

# Item from real-time search (top 20 item similar with search query - use Bert to find)
# array = [(itemBert1, itemID1), (itemBert2, itemID2)]
itemRealSearch = [([0.8,0.12,0.5], "130542248"),([0.1,-0.5,-0.2], "154852")]


In [3]:
client = redis.Redis(host = 'localhost', port=6379, decode_responses=True)
INDEX_NAME = 'idx:product-vss'

In [5]:
# Item from 3 session nearest and its weight (save in database)
q = 0.2 # weight of click item
v = 0.3 # weight of favorite item
r = 0.5 # weight of buy item

In [6]:
product39001 = client.json().get('ecommerce:product:39001')
product122782 = client.json().get('ecommerce:product:122782')
product140968 = client.json().get('ecommerce:product:140968')
product172950 = client.json().get('ecommerce:product:172950')
product175952 = client.json().get('ecommerce:product:175952')
product262528 = client.json().get('ecommerce:product:262528')
product288988 = client.json().get('ecommerce:product:288988')
product911224 = client.json().get('ecommerce:product:911224')
itemSessionVector = [(product39001['description_embeddings'], q),
                        (product122782['description_embeddings'], q),
                        (product140968['description_embeddings'], r),
                        (product172950['description_embeddings'], q),
                        (product175952['description_embeddings'], v)]

# itemRealSearch = [(product262528['description_embeddings'], str(product262528['id'])),
#                     (product262528['description_embeddings'], str(product262528['id'])),]



In [12]:
queries = ['Xe đạp thể thao','Nồi cơm điện']
encoded_queries = embedder.encode(queries)

In [None]:
query_1 = (
    Query(f'@name:{queries[0]}| @summary:{queries[0]}').return_fields('id', 'name','description_embeddings' )
)
docs = client.ft(INDEX_NAME).search(query_1).docs

for doc in docs:
    doc_id = doc.id
    print(doc_id)
    doc_name = doc.name
    print(doc_name)
    
    doc_details = client.json().get(f'{doc_id}', '$.description_embeddings')
    print(doc_details)
        


In [None]:
itemRealSearch = []
docs = []


for query_text in queries:
    # Define the query to search by name and return the desired fields
    search_query = (
        Query(f'@name:{query_text}')
        .return_fields('id', 'name')
        .paging(0, 3)  # Limit the results to the top 3 if necessary
    )
    
    # Execute the search query
    
    docs = client.ft(INDEX_NAME).search(search_query).docs
    
    # Process the search results and store them in the itemRealSearch list
    for doc in docs:
        print(doc.name)
        description_embeddings = client.json().get(f'{doc.id}', '$.description_embeddings')
        item_id = str(doc.id)
        itemRealSearch.append((item_id, description_embeddings[0]))

# Print the results
for item in itemRealSearch:
    print(item)

# Constructing a "Pure KNN" VSS Query

In [18]:
queries = ['Nồi cơm điện']
encoded_queries = embedder.encode(queries)

In [19]:
query = (
        Query('(*)=>[KNN 20 @vector_name $query_vector AS vector_score]')
                .sort_by('vector_score')
                .return_fields('vector_score', 'id', 'name', 'summary')
                .dialect(2)
        )

In [20]:
import pandas as pd
from IPython.display import display, HTML

INDEX_NAME = 'idx:product-vss'
itemRealSearch = []

def create_query_table(query, queries, encoded_queries, extra_params = {}):
    results_list = []
    for i, encoded_query in enumerate(encoded_queries):
        result_docs = client.ft(INDEX_NAME).search(query, { 'query_vector': np.array(encoded_query, dtype=np.float32).tobytes() } | extra_params).docs
        for doc in result_docs:
            vector_score = round(1 - float(doc.vector_score), 2)
            results_list.append({
                'query': queries[i], 
                'score': vector_score, 
                'id': doc.id,
                'name': doc.name,
                # 'summary': doc.summary
            })
            description_embeddings = client.json().get(f'{doc.id}', '$.description_embeddings')
            itemRealSearch.append((description_embeddings[0], (doc.id).split(":")[-1]))

    queries_table = pd.DataFrame(results_list)
    html = queries_table.to_html(index=False, classes='striped_table')  
    display(HTML(html))
    print(len(itemRealSearch))
    return itemRealSearch

In [21]:
itemRealSearch = create_query_table(query, queries, encoded_queries)

query,score,id,name
Nồi cơm điện,0.72,ecommerce:product:98597056,Nồi cơm điện mini đa năng 3 tầng khay inox 304 giữ nhiệt tốt
Nồi cơm điện,0.71,ecommerce:product:67872339,"Hộp cơm điện mini đa năng AS2020 chất liệu inox siêu bền, giữ nhiệt tốt.Hộp cơm hâm nóng, nấu chín thức ăn, cơm."
Nồi cơm điện,0.69,ecommerce:product:148000945,Nồi cơm điện MIDEA MB-FS4020E Nấu cơm nhanh chín Tiết kiệm điện Dung tích 1.5L Nhiều chế độ nấu Hàng Chính Hãng
Nồi cơm điện,0.68,ecommerce:product:147325475,Nồi lẩu kiêm chảo điện đa năng cho gia đình
Nồi cơm điện,0.68,ecommerce:product:56612474,Hộp Cơm Giữ Nhiệt INOX304 Hiện Đại Kèm Hộp Đựng Canh Soup
Nồi cơm điện,0.68,ecommerce:product:91339961,Nồi cơm điện Mini Kim Cương 0.3L - Chỉ 1 người ăn - Hàng chính hãng
Nồi cơm điện,0.67,ecommerce:product:185816376,"Nồi cơm điện KIPOR KP-N25912 - 1.2L - Phủ chống dính HOÀNG KIM, Nồi cơm có tặng kèm vỉ hấp - phù hợp 2-3 người ăn - Hàng chính hãng"
Nồi cơm điện,0.66,ecommerce:product:186070787,Nồi Cơm Điện Kim Cương 0.6 Lít Nắp Rời - Hàng Chính Hãng
Nồi cơm điện,0.66,ecommerce:product:113215758,Nồi hầm cháo - nồi hâm chậm tiết kiệm điện
Nồi cơm điện,0.66,ecommerce:product:186069941,"Nồi Cơm Điện Kim Cương 0.3 Lít - Mini, Nắp Rời"


10


In [17]:
create_query_table(query, queries, encoded_queries)

0


[]

In [31]:
print(itemRealSearch)

None


# a

In [113]:
scoreList = []
itemScoreList = []
for i in itemRealSearch:
    for j in itemSessionVector:
        score = cosine_similarity([i[0]],[j[0]])*j[1]
        scoreList.append(score)
        itemScoreList.append(i[1])

rank = scipy.stats.rankdata(scoreList)

# List of index of rank => find item id in itemScoreList (top 10)
rankScoreIndex = []
for i in range(1,11):
    a = np.where(rank == i)
    if len(a) > 0:
        rankScoreIndex = np.append(rankScoreIndex,a)
    else:
        break

# List of item id in itemScoreList ranked base-on score (de-duplicated)
rankItemId = []
for i in rankScoreIndex:
    if itemScoreList[int(i)] not in rankItemId:
        rankItemId.append(itemScoreList[int(i)])
    else:
        continue

rankItemId


['8680475',
 '71158657',
 '193898814',
 '44371242',
 '13151067',
 '47514109',
 '20427009',
 '173886106']

In [4]:
# sample dataframe
#      user1 user2
# item1
# item2
# Check item which user is not rank
def check_item_not_rank(user, rankItemId, UserItem_DF):
    
    nescessaryUserDF = UserItem_DF.loc[UserItem_DF['user'] == user].isin(rankItemId)
    
    listItemNotRated = []
    listItemRated = []

    for ite in rankItemId:
        if nescessaryUserDF[ite][user] == 0:
            listItemNotRated.append(nescessaryUserDF[ite][user])
        else:
            listItemRated.append(nescessaryUserDF[ite][user])

    
    return listItemNotRated, listItemRated


