In [6]:
import pandas as pd
from google.cloud import bigquery
import json
import auth
import requests
import subprocess
import pandas as pd
import numpy as np
import logging 
import warnings
from google.cloud import bigquery
from prophet import Prophet
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed


# Notebook-wide noise control: ignore DeprecationWarnings whose message
# mentions "__init__", and only let cmdstanpy log at WARNING level or above.
warnings.filterwarnings(
    action="ignore",
    message=".*__init__.*",
    category=DeprecationWarning,
)
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)

# Prediction URL that get_retrieval_produts posts its keyword batches to.
# NOTE(review): the host region (australia-southeast1) differs from the
# `locations/us-central1` segment of the path -- confirm which one is intended.
product_retrieval_endpoint = (
    "https://australia-southeast1-aiplatform.googleapis.com/v1/"
    "projects/639471061669/locations/us-central1/"
    "endpoints/8303344687196930048:predict"
)

# Get Google Cloud Access Token
def get_gcloud_access_token():
    """Return an access token obtained from the local ``gcloud`` CLI.

    Runs ``gcloud auth print-access-token`` and returns its standard output,
    decoded as UTF-8 with surrounding whitespace stripped.

    Returns:
        str | None: The token text, or ``None`` when the command exits with a
        non-zero status or the ``gcloud`` executable cannot be started (for
        example when it is not on ``PATH``). In both failure cases a
        diagnostic message is printed.
    """
    try:
        result = subprocess.run(
            ['gcloud', 'auth', 'print-access-token'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # check=True turns a non-zero exit status into this exception; the
        # captured stderr carries gcloud's own explanation.
        print(f"An error occurred: {e.stderr.decode('utf-8', errors='replace')}")
        return None
    except OSError as e:
        # Raised when the executable itself cannot be launched
        # (FileNotFoundError, PermissionError, ...). Treated like any other
        # failure so callers only ever have to check for None.
        print(f"An error occurred: {e}")
        return None

    return result.stdout.decode('utf-8').strip()

# Call the Cold Start model and retrieve the most similar products per keyword
def get_retrieval_produts(kws, top_n=1000, score_threshold=0.5, timeout=120):
    """Retrieve products similar to each keyword from the prediction endpoint.

    Posts one ``{"term": kw}`` instance per keyword to
    ``product_retrieval_endpoint`` and flattens the returned predictions into
    one row per (keyword, product) pair.

    TODO(review): the original author flagged the defaults of this function
    with ``#CHECK THIS`` -- confirm ``top_n`` and ``score_threshold``.

    Args:
        kws: Iterable of keyword strings to look up.
        top_n: Value sent as the ``max_results`` request parameter.
        score_threshold: Rows whose ``scores`` value is below this are dropped.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error.

    Returns:
        pandas.DataFrame: Columns ``kw``, ``scores`` and ``product_nbr`` (the
        response fields ``keyword`` and ``articles`` renamed). An empty frame
        with these columns is returned when no keywords are given, when the
        endpoint answers with a non-200 status (a warning is logged), or when
        the response contains no predictions.

    Raises:
        RuntimeError: If no gcloud access token could be obtained.
    """
    # Shape of the "nothing found" result, so callers can always index by column.
    empty_result = pd.DataFrame(columns=['kw', 'scores', 'product_nbr'])

    converted_kw = [{"term": kw} for kw in kws]
    if not converted_kw:
        return empty_result

    token = get_gcloud_access_token()
    if not token:
        # Previously this surfaced as an opaque TypeError from 'Bearer ' + None.
        raise RuntimeError(
            "Could not obtain a gcloud access token; run `gcloud auth login` and retry."
        )

    payload = json.dumps({
        "instances": converted_kw,
        "parameters": {"max_results": top_n},
    })
    headers = {
        'Authorization': 'Bearer ' + token,
        'Content-Type': 'application/json',
    }

    # requests applies no timeout unless one is passed, so a stalled endpoint
    # would otherwise block the notebook indefinitely.
    response = requests.request(
        "POST",
        product_retrieval_endpoint,
        headers=headers,
        data=payload,
        timeout=timeout,
    )

    if response.status_code != 200:
        logging.warning(
            "Product retrieval failed with HTTP %s: %s",
            response.status_code,
            response.text[:500],
        )
        return empty_result

    predictions = response.json().get('predictions') or []
    if not predictions:
        # explode() would raise KeyError on a frame without the list columns.
        return empty_result

    results = (
        pd.DataFrame.from_records(predictions)
        .explode(['scores', 'articles'])
        .reset_index(drop=True)
        .rename(columns={'articles': 'product_nbr', 'keyword': 'kw'})
    )
    return results[results['scores'] >= score_threshold]

In [9]:
## Load the facets table from the CSV file next to this notebook
facets = pd.read_csv(filepath_or_buffer="facets_b2c.csv")

In [10]:
# Row label -> search phrase mapping, and the facet display-name column.
search_phrases = dict(facets["SearchPhrase"])
facet_names = facets["FacetDisplayName"]

In [18]:
# Map each search phrase to the list of facet display names recorded for it.
result_dict = (
    facets
    .groupby('SearchPhrase')['FacetDisplayName']
    .apply(list)
    .to_dict()
)
result_dict

{'BABY ACCESSORIES': ['Wipes',
  'Bottles',
  'Teething & Soothers',
  'Bath & Skincare',
  'Baby Health & Safety',
  'Baby Toys',
  'Baby Bibs'],
 'BABY FORMULA': ['0 - 6 months', '6-12 months', '12 months plus'],
 'BACON': ['Diced Bacon', 'Streaky Bacon', 'Shortcut Bacon', 'Middle Bacon'],
 'BAKING': ['Pancake Mix',
  'Chocolate Chips',
  'Cooking Chocolate',
  'Bread Mix',
  'Brownie Mix',
  'Cookie Mix',
  'Muffin Mix',
  'Cake Decorations',
  'Colours & Essences'],
 'BISCUITS': ['Crackers',
  'Cookies',
  'Chocolate Biscuits',
  'Shortbread',
  'Crispbread',
  'Wafer Biscuits'],
 'BREAD': ['White Bread',
  'Wholegrain Bread',
  'Turkish Bread',
  'Banana Bread',
  'Raisin Toast'],
 'BUTTER': ['Butter & Margarine',
  'Unsalted Butter',
  'Salted Butter',
  'Spreadable Butter'],
 'Baby': ['Wipes',
  'Baby Food',
  'Nappies',
  'Nappy Pants',
  'Baby & Toddler Snacks',
  'Baby Formula'],
 'Baby Food': ['Organic Food',
  'Baby Food 4 Months+',
  'Baby Food 6 Months+',
  'Baby Food 8 M

dict_values([['Wipes', 'Bottles', 'Teething & Soothers', 'Bath & Skincare', 'Baby Health & Safety', 'Baby Toys', 'Baby Bibs'], ['0 - 6 months', '6-12 months', '12 months plus'], ['Diced Bacon', 'Streaky Bacon', 'Shortcut Bacon', 'Middle Bacon'], ['Pancake Mix', 'Chocolate Chips', 'Cooking Chocolate', 'Bread Mix', 'Brownie Mix', 'Cookie Mix', 'Muffin Mix', 'Cake Decorations', 'Colours & Essences'], ['Crackers', 'Cookies', 'Chocolate Biscuits', 'Shortbread', 'Crispbread', 'Wafer Biscuits'], ['White Bread', 'Wholegrain Bread', 'Turkish Bread', 'Banana Bread', 'Raisin Toast'], ['Butter & Margarine', 'Unsalted Butter', 'Salted Butter', 'Spreadable Butter'], ['Wipes', 'Baby Food', 'Nappies', 'Nappy Pants', 'Baby & Toddler Snacks', 'Baby Formula'], ['Organic Food', 'Baby Food 4 Months+', 'Baby Food 6 Months+', 'Baby Food 8 Months+', 'Baby & Toddler Snacks', 'Baby Formula'], ['Mince', 'Steak', 'Diced & Strips', 'Sausages & Burgers', 'Stock'], ['Multipacks'], ['Bread Rolls', 'Garlic Bread'], ['

In [7]:
## Products data: every row of the keyword-to-product price table for the
## keyword "milk", with the `article` column exposed as `product_nbr`.
query = '''
SELECT article as product_nbr, * EXCEPT (article) FROM `gcp-wow-rwds-ai-search-dev.14_elastic.keyword_to_product_price_BUP-MVP` 
WHERE kw = "milk";
'''
client = bigquery.Client()
milk_query_job = client.query(query)
data = milk_query_job.to_dataframe()

In [39]:
def get_products(original_term, facet_names, threshold = 0.65):
    """Retrieve the products for a search phrase's facets and tag them with it.

    Args:
        original_term: Search phrase the facets belong to; written into every
            row of the ``original_term`` column.
        facet_names: Facet display names passed to ``get_retrieval_produts``
            as the keywords to look up.
        threshold: Forwarded as ``score_threshold`` to
            ``get_retrieval_produts``.

    Returns:
        pandas.DataFrame: Columns ``kw``, ``product_nbr`` and
        ``original_term``. Empty (with the same columns) when the retrieval
        call returns ``None``.
    """
    products = get_retrieval_produts(facet_names, score_threshold = threshold)
    if products is None:
        # Nothing came back from the retrieval call; return an empty frame so
        # callers can still concatenate the result.
        return pd.DataFrame(columns=["kw", "product_nbr", "original_term"])

    # assign() builds a new frame, so the column is never written onto a
    # slice of the filtered retrieval result (the SettingWithCopy case).
    return products[["kw", "product_nbr"]].assign(original_term=original_term)

In [40]:
# Spot-check: products retrieved for the facets of the "MILK" search phrase.
d = get_products(original_term="MILK", facet_names=result_dict["MILK"])

In [50]:

# Retrieve the products for every search phrase and stack them into one frame.
# The per-phrase frames are collected first and concatenated once: calling
# pd.concat inside the loop re-copies all previously gathered rows each pass.
product_frames = [
    get_products(search_phrase, phrase_facets)
    for search_phrase, phrase_facets in result_dict.items()
]
# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
df = pd.concat(product_frames, ignore_index=True) if product_frames else pd.DataFrame()

In [52]:
import pandas as pd
from pandas_gbq import to_gbq


# BigQuery destination for the collected facet -> product rows.
project_id = 'gcp-wow-rwds-ai-search-dev'
dataset_name = '99_temp'
table_name = 'facets_products_hackathon'
destination_table = f'{dataset_name}.{table_name}'

# Upload `df`; if_exists='replace' overwrites a table that already exists.
to_gbq(df, destination_table, project_id=project_id, if_exists='replace')

print(f"DataFrame has been uploaded to BigQuery table: {destination_table}")


100%|██████████| 1/1 [00:00<00:00, 7319.90it/s]

DataFrame has been uploaded to BigQuery table: 99_temp.facets_products_hackathon





In [16]:
# Query each "MILK" facet on its own to see how many products each retrieves,
# then stack the per-facet frames with a single concat.
facet_frames = []
for facet_name in result_dict["MILK"]:
    facet_products = get_retrieval_produts([facet_name])
    if facet_products is None:
        # The retrieval call can come back with nothing; skip instead of
        # failing on `.shape`.
        print(facet_name, "no results returned")
        continue
    print(facet_name, facet_products.shape)
    facet_frames.append(facet_products)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
results = pd.concat(facet_frames, ignore_index=True) if facet_frames else pd.DataFrame()
    

Almond Milk (138, 3)
Coconut Milk (134, 3)
Oat Milk (75, 3)
Skim Milk (48, 3)
Condensed Milk (15, 3)
Rice Milk (45, 3)
Full Fat (1, 3)


['Almond Milk',
 'Coconut Milk',
 'Oat Milk',
 'Skim Milk',
 'Condensed Milk',
 'Rice Milk',
 'Full Fat']

In [46]:
# Show the rows of the milk products table for product number 761604.
data.loc[data["product_nbr"] == "761604"]

Unnamed: 0,product_nbr,kw,ctgry,subcat,sgmnt,article_with_uom,product_name,similarity,similarity_safe,click_count,avg_price,sgmnt_price_scale,kw_price_scale,product_type
476,761604,milk,BEVERAGES,LONGLIFE MILK - PLANT,LONG LIFE MILK - NUT,761604-EA,Almond Breeze Unsweetened Almond Milk 1l,0.497271,0.96264,13614,2.85393418,0.114286,0.359533,WOW
477,761604,milk,BEVERAGES,LONGLIFE MILK & SOY DRINKS,LONG LIFE MILK - NUT,761604-EA,Almond Breeze Unsweetened Almond Milk 1l,0.497271,0.96264,13614,2.949082638,0.081967,0.374708,WOW
