In [144]:
import requests
import json
import pandas as pd
import redis
from bs4 import BeautifulSoup

In [27]:
def search(search_method,query):
    """Query the local search service and return the ids of the hits.

    Args:
        search_method: Endpoint name under ``http://127.0.0.1:8000/``. The
            response shapes understood here are ``'elasticsearch'`` (ids under
            ``response['hits'][i]['_id']``) and ``'semanticsearch'`` (ids under
            ``response[i]['id']``).
        query: Free-text query string; it is sent as the ``query`` URL
            parameter.

    Returns:
        A list of ids in the order the service returned them. Any other
        ``search_method`` yields an empty list.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    url = f"http://127.0.0.1:8000/{search_method}/"
    # Progress trace, kept in the same human-readable form as before.
    print(f"{url}?query={query}")
    headers = {'Content-Type': 'application/json'}
    # Hand the query to requests via `params` so that characters such as
    # '&', '#', '+' or '%' are escaped instead of corrupting the URL.
    reply = requests.get(url, params={'query': query}, headers=headers)
    # Fail loudly on an HTTP error rather than parsing an error payload.
    reply.raise_for_status()
    response = reply.json()
    if search_method == 'elasticsearch':
        return [hit['_id'] for hit in response['hits']]
    if search_method == 'semanticsearch':
        return [hit['id'] for hit in response]
    return []

In [28]:
# Load the query sample; the cells below read its 'query' column.
queries = pd.read_csv('final_query_sample.csv')

In [30]:
# Run every query against the 'elasticsearch' endpoint and keep the returned id list per row.
queries['elastic_results'] = queries['query'].map(
    lambda query_text: search('elasticsearch', query_text)
)

http://127.0.0.1:8000/elasticsearch/?query=chocolate thins
http://127.0.0.1:8000/elasticsearch/?query=girls briefs
http://127.0.0.1:8000/elasticsearch/?query=essences
http://127.0.0.1:8000/elasticsearch/?query=bao
http://127.0.0.1:8000/elasticsearch/?query=beans sprout
http://127.0.0.1:8000/elasticsearch/?query=butter devonda
http://127.0.0.1:8000/elasticsearch/?query=powerade zero
http://127.0.0.1:8000/elasticsearch/?query=paracetamol caplets
http://127.0.0.1:8000/elasticsearch/?query=arnotts granitas
http://127.0.0.1:8000/elasticsearch/?query=always olives
http://127.0.0.1:8000/elasticsearch/?query=onion oil
http://127.0.0.1:8000/elasticsearch/?query=decor glass
http://127.0.0.1:8000/elasticsearch/?query=masterfood marinades
http://127.0.0.1:8000/elasticsearch/?query=peas coles
http://127.0.0.1:8000/elasticsearch/?query=aoli
http://127.0.0.1:8000/elasticsearch/?query=curry rice
http://127.0.0.1:8000/elasticsearch/?query=apple scroll
http://127.0.0.1:8000/elasticsearch/?query=minced p

In [31]:
# Run every query against the 'semanticsearch' endpoint and keep the returned id list per row.
queries['semantic_results'] = queries['query'].map(
    lambda query_text: search('semanticsearch', query_text)
)

http://127.0.0.1:8000/semanticsearch/?query=chocolate thins
http://127.0.0.1:8000/semanticsearch/?query=girls briefs
http://127.0.0.1:8000/semanticsearch/?query=essences
http://127.0.0.1:8000/semanticsearch/?query=bao
http://127.0.0.1:8000/semanticsearch/?query=beans sprout
http://127.0.0.1:8000/semanticsearch/?query=butter devonda
http://127.0.0.1:8000/semanticsearch/?query=powerade zero
http://127.0.0.1:8000/semanticsearch/?query=paracetamol caplets
http://127.0.0.1:8000/semanticsearch/?query=arnotts granitas
http://127.0.0.1:8000/semanticsearch/?query=always olives
http://127.0.0.1:8000/semanticsearch/?query=onion oil
http://127.0.0.1:8000/semanticsearch/?query=decor glass
http://127.0.0.1:8000/semanticsearch/?query=masterfood marinades
http://127.0.0.1:8000/semanticsearch/?query=peas coles
http://127.0.0.1:8000/semanticsearch/?query=aoli
http://127.0.0.1:8000/semanticsearch/?query=curry rice
http://127.0.0.1:8000/semanticsearch/?query=apple scroll
http://127.0.0.1:8000/semanticsear

In [127]:
def get_the_next_item(item_list, idx, common_list ):
    """Find the first entry of ``item_list`` at or after ``idx`` not in ``common_list``.

    Args:
        item_list: Sequence to scan.
        idx: Position to start scanning from, or ``None`` if a previous call
            already reported that the list is exhausted.
        common_list: Container of items that must be skipped.

    Returns:
        ``(position, item)`` for the first item not in ``common_list``, or
        ``(None, None)`` when no such item remains.
    """
    # This function signals exhaustion with idx=None; accept that value back
    # so callers can keep passing the index they were given.
    if idx is None:
        return None, None
    # range() is empty for an empty list or a start past the end, so both
    # fall through to the "nothing left" result instead of raising.
    for position in range(idx, len(item_list)):
        candidate = item_list[position]
        if candidate not in common_list:
            return position, candidate
    return None, None
        

In [130]:
def interleave(list1, list2, k):
    """Merge two ranked lists by alternating picks, skipping duplicates.

    Starting with ``list1``, the lists take turns contributing their next
    item that has not been emitted yet. When one list runs out, the
    remaining picks all come from the other list.

    Args:
        list1: First ranked list of hashable items.
        list2: Second ranked list of hashable items.
        k: Maximum number of items to return.

    Returns:
        A list of at most ``k`` distinct items in interleaved order.
    """
    sources = (list1, list2)
    positions = [0, 0]      # next unread position in each source
    seen = set()            # items already emitted, for O(1) duplicate checks
    interleaved_list = []
    turn = 0                # 0 -> list1 picks next, 1 -> list2 picks next

    while len(interleaved_list) < k and (
        positions[0] < len(list1) or positions[1] < len(list2)
    ):
        items = sources[turn]
        pos = positions[turn]
        # Skip entries the other list has already contributed.
        while pos < len(items) and items[pos] in seen:
            pos += 1
        if pos < len(items):
            interleaved_list.append(items[pos])
            seen.add(items[pos])
            pos += 1
        positions[turn] = pos
        # Always hand the turn over; an exhausted list simply passes.
        turn = 1 - turn
    return interleaved_list
            

In [129]:
# Smoke check: empty first list, 11 items in the second, k=10.
interleave([],[1,69,4,5,6,7,8,9,20,37,88],10)

list 1 :0; list 2 :11


[1, 69, 4, 5, 6, 7, 8, 9, 20, 37]

In [131]:
# Merge the two ranked id lists of every query into one list of at most 10 ids.
# The row is read by column label: integer keys such as x[0] on a
# label-indexed row depend on positional fallback, which pandas has deprecated.
queries['merged_results'] = queries[['elastic_results','semantic_results']].apply(
    lambda row: interleave(row['elastic_results'], row['semantic_results'], 10),
    axis=1,
)

In [132]:
def jaccard_set(list1, list2):
    """Jaccard similarity of the distinct items in two lists.

    Both inputs are reduced to sets first, so repeated items do not inflate
    the union.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        ``|A & B| / |A | B|`` as a float in ``[0.0, 1.0]``. When both inputs
        are empty the union is empty and ``0.0`` is returned instead of
        dividing by zero.
    """
    first, second = set(list1), set(list2)
    union = first | second
    if not union:
        return 0.0
    return len(first & second) / len(union)

In [137]:
# Connection pool for the Redis server on localhost:6379, database 0.
pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
# Client backed by that pool; the rating-task cells call redis_docs.mget(...) on it.
redis_docs = redis.Redis(connection_pool=pool)

In [133]:
# Overlap between the two result lists of every query.
# The row is read by column label: integer keys such as x[0] on a
# label-indexed row depend on positional fallback, which pandas has deprecated.
queries['jaccard'] = queries[['elastic_results','semantic_results']].apply(
    lambda row: jaccard_set(row['elastic_results'], row['semantic_results']),
    axis=1,
)

In [163]:
# Write the query table, including the columns added above, without the index.
queries.to_csv('final_query_set.csv',index=False)

In [164]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the queries for the test rating task. A fixed random_state
# makes the split identical on every run, so the exported train/test files
# can be regenerated after a kernel restart.
train, test = train_test_split(queries, test_size=0.2, random_state=42)

In [165]:
# Build one rating row per (query, merged result) pair for the training queries.
rating_task = []
for query, results in zip(train['query'], train['merged_results']):
    if not results:
        # MGET needs at least one key; a query without merged results has
        # nothing to rate.
        continue
    for result in redis_docs.mget(results):
        if result is None:
            # mget returns None for ids that are not stored in Redis.
            continue
        result = json.loads(result)
        # product_details is parsed as HTML; only its text is kept for raters.
        soup = BeautifulSoup(result['product_details'], 'html.parser')
        rating_task.append([query,result['id'], result['title'], soup.text,result['product_image'], result['url']])

In [167]:
# Tabulate the collected rows; column order matches the lists appended to rating_task.
rating_task_train = pd.DataFrame(rating_task,columns=['query','id','title','details','img_url','url'])

In [168]:
# Export the training rating tasks without the index.
# NOTE(review): absolute, user-specific path; this only runs on a machine with this directory.
rating_task_train.to_csv('/Users/vli/Work/RetailSearch/MagpieSearch/data/rating_tasks_train.csv',index=False)

In [169]:
# (rows, columns) of the training rating table.
rating_task_train.shape

(7110, 6)

In [170]:
# Build one rating row per (query, merged result) pair for the held-out queries.
rating_task = []
for query, results in zip(test['query'], test['merged_results']):
    if not results:
        # MGET needs at least one key; a query without merged results has
        # nothing to rate.
        continue
    for result in redis_docs.mget(results):
        if result is None:
            # mget returns None for ids that are not stored in Redis.
            continue
        result = json.loads(result)
        # product_details is parsed as HTML; only its text is kept for raters.
        soup = BeautifulSoup(result['product_details'], 'html.parser')
        rating_task.append([query,result['id'], result['title'], soup.text,result['product_image'], result['url']])

In [171]:
# Tabulate the collected rows; column order matches the lists appended to rating_task.
rating_task_test = pd.DataFrame(rating_task,columns=['query','id','title','details','img_url','url'])

In [172]:
# Export the test rating tasks without the index.
# NOTE(review): absolute, user-specific path; this only runs on a machine with this directory.
rating_task_test.to_csv('/Users/vli/Work/RetailSearch/MagpieSearch/data/rating_tasks_test.csv',index=False)

In [173]:
# (rows, columns) of the test rating table.
rating_task_test.shape

(1780, 6)

In [175]:
# Export the first 50 rows of the test rating table as a separate sample file.
# NOTE(review): absolute, user-specific path; this only runs on a machine with this directory.
rating_task_test[:50].to_csv('/Users/vli/Work/RetailSearch/MagpieSearch/data/rating_tasks_test_sample.csv',index=False)