<a href="https://colab.research.google.com/github/te-supreme-beef/fresho/blob/main/fresho_assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Three Step Approach

### 1. Biencoder
  * Format extracted strings
  * Generate an embedding for every unique product_description (product_name + quantity_type_name pair) in supplier_df
  * Generate an embedding for each extracted product_description in predictions_df
  * Filter to the top 50 product_descriptions from supplier_df for each extracted product description in predictions_df.
    * Make sure to add the suggested_product_description if it is not already included.
  * The accuracy of the top 50 recommendations puts an upper limit on cross encoder performance.

### 2. Cross Encoder and rerank
  * Train cross encoder model for more detailed embedding similarities.
  * Evaluation metric f1.
  * Use trained cross encoder to evaluate extracted_product_description from predictions_df to get product_id, quantity_type_id and product_name

### 3. Quantity - didn't get to this

* Regression/Classification: This can be a regression problem on the value of extracted_quantity, or a multi-class classification if quantities are limited (e.g., 1, 2, 5, 10).

In [None]:
!pip install pandasql
!pip install sentence-transformers
!pip install datasets accelerate evaluate scikit-learn
!pip install transformers --upgrade
!pip install pandasql

#Imports

In [None]:
import json
import os
import pandas as pd
from pandasql import sqldf
import re
from sentence_transformers import SentenceTransformer
import random
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity



#Mount Google Drive

In [None]:
# Colab-specific: mount Google Drive at /content/drive so the data files and
# model checkpoints referenced by later cells are reachable as local paths.
from google.colab import drive
drive.mount('/content/drive')

#Load Data

In [None]:
# Location of the take-home data inside the mounted Google Drive.
drive_path = '/content/drive/My Drive/'
directory_name = 'fresho/fresho_take_home'
file_name = 'predictions.jsonl'

full_path = os.path.join(drive_path, directory_name, file_name)

# Parse the JSON Lines file: one JSON object per non-blank line.
data = []
if os.path.exists(full_path):
    # Explicit UTF-8 so the file decodes the same way regardless of the
    # runtime's default locale.
    with open(full_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads raise on them.
            if line.strip():
                data.append(json.loads(line))
    print(f"Successfully loaded {len(data)} lines from {full_path}.")
    # Display the first few entries to verify
    if data:
        print("First 5 entries:")
        for item in data[:5]:
            print(item)
    else:
        print("The .jsonl file was empty.")
else:
    print(f"Error: File not found at '{full_path}'. Please check the file name and path.")

# NOTE: if the file was missing this is an empty DataFrame and later cells
# that index specific columns will fail.
predictions_df = pd.DataFrame(data)
display(predictions_df.head())

In [None]:

# Supplier catalogue; lives next to predictions.jsonl (drive_path and
# directory_name are set by the previous loading cell).
file_name = 'supplier_inventory.jsonl'

full_path = os.path.join(drive_path, directory_name, file_name)

# Parse the JSON Lines file: one JSON object per non-blank line.
data = []
if os.path.exists(full_path):
    # Explicit UTF-8 so the file decodes the same way regardless of the
    # runtime's default locale.
    with open(full_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads raise on them.
            if line.strip():
                data.append(json.loads(line))
    print(f"Successfully loaded {len(data)} lines from {full_path}.")
    if data:
        print("First 5 entries:")
        for item in data[:5]:
            print(item)
    else:
        print("The .jsonl file was empty.")
else:
    print(f"Error: File not found at '{full_path}'. Please check the file name and path.")

# NOTE: if the file was missing this is an empty DataFrame and later cells
# that index specific columns will fail.
supplier_df = pd.DataFrame(data)
display(supplier_df.head())

# Product Name Standardization

In [None]:
def standardize_name(name):
    """Normalise a product name so equivalent spellings compare equal.

    Steps, in order: lowercase; rewrite numeric ranges ("10-12" -> "10 to 12");
    put spaces around a remaining "/" or "-" between digits; turn slashes into
    spaces; drop all other punctuation; collapse whitespace.

    Args:
        name: Raw product name. Non-string values (e.g. None/NaN) are
            returned unchanged.

    Returns:
        The normalised name, or ``name`` itself when it is not a string.
    """
    if not isinstance(name, str):
        return name

    name = name.lower()
    # Numeric ranges: 'x - y' -> 'x to y'.
    name = re.sub(r'(\d+)\s*-\s*(\d+)', r'\1 to \2', name)
    # Keep digits on either side of a remaining '/' or '-' from fusing once
    # the separator itself is removed (e.g. "1/2" must not become "12").
    name = re.sub(r'(\d)([/-])(\d)', r'\1 \2 \3', name)
    # Slashes act as word separators ("apple/pear" -> "apple pear"). This must
    # run BEFORE punctuation stripping, otherwise the slash is already gone
    # and the two words are glued together.
    name = re.sub(r'[\\/]', ' ', name)
    # Drop remaining punctuation (commas, quotes, periods, ...).
    # NOTE(review): this also removes decimal points ("1.5kg" -> "15kg");
    # confirm that is acceptable for product names.
    name = re.sub(r'[^\w\s]', '', name)
    # Collapse runs of spaces/newlines/tabs ("Stew\nPack" -> "stew pack").
    return re.sub(r'\s+', ' ', name).strip()

# Normalise the catalogue product names.
supplier_df['standardized_product_name'] = supplier_df['product_name'].apply(standardize_name)

# Normalise the three name variants carried by each prediction row
# (extracted, confirmed, suggested) with the same function so they are
# directly comparable with the catalogue names.
predictions_df['standardized_extracted_product_name'] = predictions_df['extracted_product_name'].apply(standardize_name)
predictions_df['standardized_confirmed_product_name'] = predictions_df['confirmed_product_name'].apply(standardize_name)
predictions_df['standardized_suggested_product_name'] = predictions_df['suggested_product_name'].apply(standardize_name)

# Side-by-side view of raw vs. standardised names for a quick sanity check.
display(predictions_df[['extracted_product_name', 'standardized_extracted_product_name', 'confirmed_product_name', 'standardized_confirmed_product_name', 'suggested_product_name', 'standardized_suggested_product_name']].head())


# Quantity Type Name

In [None]:
# Generate quantity types for standardization rules
# The query counts rows per extracted_quantity_type_name, orders the counts
# descending and keeps only values that occur more than 50 times.

query = '''
select * from (
select
extracted_quantity_type_name, count(*) as total_types
from predictions_df
group by 1
order by count(*) desc
)iq where total_types > 50
'''

# Execute the SQL query
# predictions_df is referenced by name inside the SQL; sqldf is called without
# an explicit environment argument.
sql_result = sqldf(query)

# Display the results
display(sql_result)

# Disabled export of the frequency table.
# NOTE(review): the target path ends in .csv but the call is to_parquet.
# drive_path = '/content/drive/My Drive/'
# directory_name = 'fresho/fresho_take_home/quantity_type_names.csv'

# quantity_type_full_path = os.path.join(drive_path, directory_name)

# sql_result.to_parquet(quantity_type_full_path, index=False)


In [None]:
import re
import pandas as pd

# Dictionary for fixing specific OCR errors found in your data file
# Keys are whole tokens: apply_typo_correction looks each whitespace-separated
# word up here and replaces it only on an exact match.
TYPO_CORRECTION = {
    'unch': 'bunch',
    'bncet': 'bunch',
    'eteh': 'each',
    'unie': 'unit',
    'counei': 'count',
    'coune': 'count',
    'anett': 'punnet',
    'acke': 'pack',
    'pac': 'pack',
    'ba': 'bag',
    'bot': 'bottle',
    'bottl': 'bottle',
    'kge': 'kg',
    'ilogram': 'kg',
    'kilogram': 'kg',
    'gram': 'g',
    'litr': 'l',
    'ltr': 'l',
    'mlt': 'ml',
    'crt': 'carton',
    'ctn': 'carton',
    'bx': 'box',
    'caez': 'case',
    'ase': 'case',
    'bdl': 'bundle',
    'doz': 'dozen',
    'ea': 'each',
    'pre-ao': 'pack',
    'unknow': 'unit',
}

# Conversion factors: All weights/volumes standardized to base units (kilograms/liters)
# perform_unit_conversion multiplies the parsed number by the factor of the
# longest key that the unit text starts with.
UNIT_CONVERSIONS = {
    # Weight (Standardized to Kilograms)
    'kg': 1.0,
    'kilo': 1.0,
    'g': 0.001,
    'gram': 0.001,
    'lb': 0.453592,
    'oz': 0.0283495,

    # Volume (Standardized to Liters)
    'l': 1.0,
    'liter': 1.0,
    'ml': 0.001,
    'milliliter': 0.001,
    'fl oz': 0.0295735,
    'qt': 0.946353,

    # Count (Standardized to Count 'ct')
    'ct': 1.0,
    'count': 1.0,
    'dozen': 12.0,
    'each': 1.0,
    'bunch': 1.0,
    'unit': 1.0,
    'case': 1.0,
    'box': 1.0,
    'tub': 1.0,
    'punnet': 1.0,
    'carton': 1.0,
    'pcs': 1.0,
    'piece': 1.0,
    'pc': 1.0,
}

# Mapping of messy packaging terms to canonical categories
# abstract_packaging returns the category of the first token found here.
PACKAGING_ABSTRACTIONS = {
    # Flexible/Soft Containers
    'bag': 'container_flexible',
    'pouch': 'container_flexible',
    'package': 'container_flexible',
    'wrapper': 'container_flexible',
    'sleeve': 'container_flexible',

    # Rigid/Hard Containers
    'box': 'container_rigid',
    'carton': 'container_rigid',
    'case': 'container_rigid',
    'clamshell': 'container_rigid',
    'tray': 'container_rigid',
    'punnet': 'container_rigid',
    'tub': 'container_rigid',

    # Liquid Containers
    'bottle': 'container_liquid',
    'jug': 'container_liquid',
    'can': 'container_liquid',
    'jar': 'container_liquid',

    # Discrete Units
    'unit': 'single_unit',
    'each': 'single_unit',
    'pc': 'single_unit',
    'piece': 'single_unit',

    # Placeholder for non-quantifiable items
    'variable': 'variable_weight',
    'bulk': 'variable_weight',
}


def clean_special_chars(text: str) -> str:
    """Strip noise characters from a raw quantity-type string.

    Lowercases the text, removes the multiplication-style separator ``x``,
    replaces every character other than ``a-z``, ``0-9``, whitespace and
    ``.`` with a space, separates numbers from adjacent letters, and
    collapses whitespace.

    Args:
        text: Raw quantity-type text. Non-string input yields ``""``.

    Returns:
        The cleaned, single-spaced, lowercase string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # 1. Drop 'x' only where it is a separator: between two numbers
    #    ("1x1ea", "6 x 1kg") or as a standalone token. Replacing every 'x'
    #    would corrupt ordinary words such as "mixed" or "extra".
    text = re.sub(r'(?<=\d)\s*x\s*(?=\d)', ' ', text)
    text = re.sub(r'\bx\b', ' ', text)

    # 2. Replace everything except letters, digits, whitespace and the
    #    decimal point with a space: ( ) , - / % ~ ! etc.
    text = re.sub(r'[^a-z0-9\s\.]', ' ', text)

    # 3. Number (incl. decimals) followed by letters: "10kg" -> "10 kg".
    text = re.sub(r'(\d+(?:\.\d+)?)([a-z]+)', r'\1 \2', text)

    # 4. Letters followed by a number: "no1" -> "no 1".
    text = re.sub(r'([a-z]+)(\d+)', r'\1 \2', text)

    # 5. Collapse runs of whitespace.
    return re.sub(r'\s+', ' ', text).strip()


def apply_typo_correction(text: str) -> str:
    """Fix known OCR typos token by token and drop stray single letters.

    Each whitespace-separated token is handled as follows:
      * a token that is a key of ``TYPO_CORRECTION`` is replaced by its value;
      * otherwise a one-character alphabetic token is discarded unless it is
        one of 'g', 'l', 'm', 'k';
      * anything else (including single digits) is kept unchanged.

    Returns the surviving tokens joined by single spaces.
    """
    unit_letters = ['g', 'l', 'm', 'k']

    def _is_stray_letter(token):
        # One alphabetic character that is not a recognised unit abbreviation.
        return len(token) == 1 and token.isalpha() and token not in unit_letters

    kept = []
    for token in text.split():
        if token in TYPO_CORRECTION:
            kept.append(TYPO_CORRECTION[token])
        elif not _is_stray_letter(token):
            kept.append(token)

    return " ".join(kept)


def perform_unit_conversion(value, raw_unit):
    """Convert a (value, unit text) pair to a canonical "<number> <unit>" string.

    The unit is the longest key of ``UNIT_CONVERSIONS`` that ``raw_unit``
    starts with (so 'fl oz' wins over 'oz'). The value is multiplied by that
    key's factor and labelled 'kg' (weights), 'L' (volumes) or 'ct' (counts).

    Args:
        value: Numeric quantity.
        raw_unit: Lowercased unit text following (or preceding) the number.

    Returns:
        A string such as "1.000 kg", or None when no known unit prefix matches.
    """
    weight_units = ('kg', 'g', 'lb', 'oz', 'kilo', 'gram')
    volume_units = ('l', 'ml', 'fl oz', 'qt', 'liter', 'milliliter')
    # Every key in the count section of UNIT_CONVERSIONS. 'tub', 'carton' and
    # 'piece' were previously missing here, so they were labelled with their
    # own name instead of 'ct'.
    count_units = (
        'ct', 'count', 'dozen', 'each', 'bunch', 'unit', 'case', 'box',
        'tub', 'punnet', 'carton', 'pcs', 'piece', 'pc',
    )

    # Longest keys first so multi-character units are preferred over their
    # shorter prefixes.
    sorted_keys = sorted(UNIT_CONVERSIONS.keys(), key=len, reverse=True)

    # NOTE(review): this is a prefix match, so unit text like "large" matches
    # 'l' and "green" matches 'g'. Tightening it would also stop accidental
    # matches such as "litres" -> 'l'; confirm the intended vocabulary first.
    unit_key = next((key for key in sorted_keys if raw_unit.startswith(key)), None)
    if not unit_key:
        return None

    canonical_value = value * UNIT_CONVERSIONS[unit_key]

    if unit_key in weight_units:
        unit_display = 'kg'
    elif unit_key in volume_units:
        unit_display = 'L'
    elif unit_key in count_units:
        unit_display = 'ct'
    else:
        # Unknown family: fall back to the matched key itself.
        unit_display = unit_key

    return f"{canonical_value:.3f} {unit_display}"

def canonicalize_units(quantity_text: str) -> str:
    """Extract one canonical measurement ("<number> kg|L|ct") from cleaned text.

    Resolution order:
      1. first "number unit" match that converts to a weight (kg) or volume (L);
      2. a "unit number" match (e.g. "box 40") that converts successfully;
      3. first "number unit" match of any kind;
      4. otherwise the input, lowercased and stripped.
    """
    number_then_unit = re.compile(r'(\d+(?:\.\d+)?)\s?([a-zA-Z\s\.]+)')

    # Convert every "number unit" occurrence; keep only successful conversions.
    converted = [
        perform_unit_conversion(float(hit.group(1)), hit.group(2).lower().strip())
        for hit in number_then_unit.finditer(quantity_text)
    ]
    candidates = [item for item in converted if item]

    # Weight/volume measurements take precedence over generic counts.
    for item in candidates:
        if item.endswith(' kg') or item.endswith(' L'):
            return item

    # Reverse layout: unit text first, number second.
    unit_then_number = re.search(
        r'([a-zA-Z\s\.]+)\s?(\d+(?:\.\d+)?)', quantity_text, re.IGNORECASE
    )
    if unit_then_number:
        unit_text = unit_then_number.group(1).lower().strip()
        number = float(unit_then_number.group(2))
        reverse_result = perform_unit_conversion(number, unit_text)
        if reverse_result:
            return reverse_result

    # Generic count found by the forward pattern, if any.
    if candidates:
        return candidates[0]

    return quantity_text.lower().strip()


def normalize_format(text: str) -> str:
    """Lowercase ``text``, blank out punctuation and squeeze whitespace.

    Characters other than ``a-z``, ``0-9``, whitespace and ``.`` become
    spaces; numbers and letters are not split apart here.
    """
    blanked = re.sub(r'[^a-z0-9\s\.]', ' ', text.lower())
    return re.sub(r'\s+', ' ', blanked).strip()


def abstract_packaging(text: str) -> str:
    """Map the packaging word in ``text`` to its broad category.

    The first whitespace-separated token that is a key of
    ``PACKAGING_ABSTRACTIONS`` decides the category. Failing that, the
    substrings 'variable', 'bulk' or 'loose' anywhere in the text give
    'variable_weight'; otherwise 'unspecified_packaging' is returned.
    """
    known_token = next(
        (token for token in text.split() if token in PACKAGING_ABSTRACTIONS),
        None,
    )
    if known_token is not None:
        return PACKAGING_ABSTRACTIONS[known_token]

    for marker in ('variable', 'bulk', 'loose'):
        if marker in text:
            return 'variable_weight'

    return 'unspecified_packaging'


def standardize_quantity_type(raw_quantity_type: str) -> str:
    """Turn a raw quantity-type string into its standardised form.

    Pipeline: clean noise -> fix OCR typos -> extract a canonical measurement
    -> normalise -> classify the packaging.

    Args:
        raw_quantity_type: Raw quantity-type text; may be empty, None or NaN.

    Returns:
        * "unspecified_quantity" for empty/missing input;
        * "<measurement> in <packaging category>" when a canonical
          measurement (kg, L or ct) was extracted, e.g.
          "1.000 kg in single_unit";
        * otherwise the packaging category if one was recognised;
        * otherwise the normalised text.
    """
    if not raw_quantity_type or pd.isna(raw_quantity_type):
        return "unspecified_quantity"

    # 1. Clean noise ("Each (1KG)" -> "each 1 kg")
    cleaned_text = clean_special_chars(str(raw_quantity_type))

    # 2. Fix OCR typos
    typo_fixed_text = apply_typo_correction(cleaned_text)

    # 3. Canonicalize units ("each 1 kg" -> "1.000 kg")
    canonical_unit_str = canonicalize_units(typo_fixed_text)

    # 4. Normalize format (input for the packaging abstraction)
    normalized_text = normalize_format(typo_fixed_text)

    # 5. Abstract packaging ("each 1 kg" -> "single_unit")
    packaging_category = abstract_packaging(normalized_text)

    # A successfully extracted measurement looks like "1.000 kg".
    if re.search(r'^\d+\.\d+\s(kg|L|ct)$', canonical_unit_str):
        return f"{canonical_unit_str} in {packaging_category}"

    # No measurement: fall back to the packaging category, then to the text.
    if packaging_category != 'unspecified_packaging':
        return packaging_category
    return normalized_text

In [None]:
# Standardise the catalogue quantity types.
supplier_df['standardized_quantity_type_name'] = supplier_df['quantity_type_name'].apply(standardize_quantity_type)

# Standardise the extracted / confirmed / suggested quantity types of each
# prediction with the same function so they are comparable to the catalogue.
predictions_df['standardized_extracted_quantity_type_name'] = predictions_df['extracted_quantity_type_name'].apply(standardize_quantity_type)
predictions_df['standardized_confirmed_quantity_type_name'] = predictions_df['confirmed_quantity_type_name'].apply(standardize_quantity_type)
predictions_df['standardized_suggested_quantity_type_name'] = predictions_df['suggested_quantity_type_name'].apply(standardize_quantity_type)

# Raw vs. standardised values for a visual check of the first 20 rows.
display(predictions_df[[
  'extracted_quantity_type_name',
  'standardized_extracted_quantity_type_name',
  'confirmed_quantity_type_name',
  'standardized_confirmed_quantity_type_name'
]].head(20))

# Part 1: Create embeddings and generate topk products

In [None]:
def create_rich_text_separated(standardized_name: str, standardized_quantity_type: str) -> str:
    """Join a product name and quantity type into one description string.

    Both parts are stripped and joined as "<name> | <quantity type>".
    A part that is not a string (e.g. None or NaN, which standardize_name
    passes through unchanged for missing names) is treated as empty instead
    of raising AttributeError on ``.strip()``.
    """
    SEPARATOR = " | "

    name = standardized_name.strip() if isinstance(standardized_name, str) else ""
    quantity_type = (
        standardized_quantity_type.strip()
        if isinstance(standardized_quantity_type, str)
        else ""
    )

    return f"{name}{SEPARATOR}{quantity_type}"

# Build "<name> | <quantity type>" description columns.
# Each entry: (frame, name column, quantity-type column, output column).
_description_specs = [
    (supplier_df,
     'standardized_product_name',
     'standardized_quantity_type_name',
     'standardized_product_description'),
    (predictions_df,
     'standardized_extracted_product_name',
     'standardized_extracted_quantity_type_name',
     'standardized_extracted_product_description'),
    (predictions_df,
     'standardized_confirmed_product_name',
     'standardized_confirmed_quantity_type_name',
     'standardized_confirmed_product_description'),
    (predictions_df,
     'standardized_suggested_product_name',
     'standardized_suggested_quantity_type_name',
     'standardized_suggested_product_description'),
]

for _frame, _name_col, _qty_col, _out_col in _description_specs:
    # The lambda is consumed immediately by apply, so reading the loop
    # variables from the enclosing scope is safe.
    _frame[_out_col] = _frame.apply(
        lambda row: create_rich_text_separated(row[_name_col], row[_qty_col]),
        axis=1
    )

# display(supplier_df['standardized_product_description'].head(20))
display(predictions_df[['standardized_extracted_product_description','standardized_confirmed_product_description','standardized_suggested_product_description']].head(20))

In [None]:
# Bi-encoder used for first-stage retrieval. No device is forced: when the
# device argument is left unset, sentence-transformers uses a GPU if one is
# available and falls back to CPU otherwise, so the cell also runs on a
# CPU-only runtime instead of crashing on a hard-coded 'cuda'.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One row per distinct (description, product_id, quantity_type_id) triple.
supplier_name_unique_df = supplier_df[['standardized_product_description','product_id','quantity_type_id']].drop_duplicates().reset_index(drop=True)
# encode() returns a 2-D array; list() stores one embedding vector per row.
supplier_name_unique_df['product_desc_embeddings'] = list(
    model.encode(
        supplier_name_unique_df['standardized_product_description'].tolist(),
        show_progress_bar=True,
    )
)
display(supplier_name_unique_df.head())


def get_embedding_for_text(text):
    """Return the bi-encoder embedding of ``text`` as a NumPy array."""
    return model.encode(text, convert_to_numpy=True)

In [None]:
# Embed the extracted, confirmed and suggested descriptions of every
# prediction row. Each entry: (output column, source text column).
for _embedding_col, _text_col in (
    ('extracted_product_desc_embeddings', 'standardized_extracted_product_description'),
    ('standardized_confirmed_product_desc_embeddings', 'standardized_confirmed_product_description'),
    ('standardized_suggested_product_desc_embeddings', 'standardized_suggested_product_description'),
):
    _vectors = model.encode(predictions_df[_text_col].tolist(), show_progress_bar=True)
    # One embedding vector per DataFrame row.
    predictions_df[_embedding_col] = list(_vectors)

# Catalogue Filtering

* Goal: get a solid hard negative for every extracted product name
  * Use the incorrect suggested product_name and quantity_type_name where is_prediction_correct = 0


In [None]:
import torch

# Use the GPU when present; otherwise run the similarity search on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Stack the per-row embedding vectors into (n_rows, dim) matrices.
supplier = torch.tensor(np.vstack(supplier_name_unique_df['product_desc_embeddings'].values), device=device)
queries  = torch.tensor(np.vstack(predictions_df['extracted_product_desc_embeddings'].values), device=device)

# L2-normalise so the dot product below is cosine similarity.
supplier = torch.nn.functional.normalize(supplier, dim=1)
queries  = torch.nn.functional.normalize(queries, dim=1)

# torch.topk raises if k exceeds the number of candidates, so clamp K to the
# catalogue size.
K = min(50, supplier.shape[0])
batch_size = 1024

all_topk_idx = []
all_topk_scores = []

# Process queries in batches to bound the size of the similarity matrix.
for i in range(0, len(queries), batch_size):
    q = queries[i:i+batch_size]

    sim = q @ supplier.T

    # top-K catalogue rows per query, highest similarity first
    scores, idx = torch.topk(sim, K, dim=1)
    all_topk_idx.append(idx.cpu().numpy())
    all_topk_scores.append(scores.cpu().numpy())

# (n_queries, K) arrays of catalogue row positions and their similarities.
topk_idx = np.vstack(all_topk_idx)
topk_scores = np.vstack(all_topk_scores)


In [None]:
supplier_reset = supplier_name_unique_df.reset_index(drop=True)

# Convert supplier metadata columns to NumPy arrays
product_descs = supplier_reset['standardized_product_description'].to_numpy()
product_ids = supplier_reset['product_id'].to_numpy()
product_quantity_ids = supplier_reset['quantity_type_id'].to_numpy()


# Index using topk_idx (2D) -> still 2D, then convert to list of lists
topk_product_desc_raw = product_descs[topk_idx].tolist()
topk_product_ids_raw = product_ids[topk_idx].tolist()
topk_product_quantity_ids_raw = product_quantity_ids[topk_idx].tolist()

# Pre-create the list-valued columns so that .at[] can store a list per row.
# (Column names, including the 'qauntity' spelling, are kept because later
# cells read them.)
for _col in (
    # For negatives
    'topk_standardized_product_desc',
    # For reranking
    'topk_product_ids',
    'topk_qauntity_type_ids',
    'topk_standardized_product_desc_unfiltered',
    'topk_product_ids_unfiltered',
    'topk_qauntity_type_ids_unfiltered',
    'topk_standardized_product_desc_unfiltered_with_confirmed',
    'topk_product_ids_unfiltered_with_confirmed',
    'topk_qauntity_type_ids_unfiltered_with_confirmed',
    'topk_similarity_score',
):
    predictions_df[_col] = [[] for _ in range(len(predictions_df))]

# Flag column written in the loop and read by the recall cell. Previously a
# differently named column was initialised here and never updated.
predictions_df['confirmed_name_in_topk_product_name_raw'] = 0

# `pos` is the row's position (matches the row order of topk_* arrays);
# `i` is the index label used for .at[]. They coincide for a default
# RangeIndex but are kept separate so a filtered frame still works.
for pos, (i, row) in enumerate(predictions_df.iterrows()):
    confirmed_product_desc = row['standardized_confirmed_product_description']

    current_topk_desc = topk_product_desc_raw[pos]
    current_topk_scores = topk_scores[pos]
    current_topk_product_ids = topk_product_ids_raw[pos]
    current_topk_product_quantity_type_ids = topk_product_quantity_ids_raw[pos]

    # 1 when retrieval already surfaced the confirmed description.
    confirmed_desc_in_topk_product_name_raw = int(confirmed_product_desc in current_topk_desc)

    filtered_product_desc = []
    filtered_similarity_scores = []

    # The wrongly suggested item is likely a very strong negative; make sure
    # it is included. Compare and store the full *description* (name | qty),
    # which is what the top-k list and the cross-encoder pairs contain, and
    # never add it when it equals the confirmed description.
    suggested_product_desc = row['standardized_suggested_product_description']
    if (row['is_prediction_correct'] == 0
            and suggested_product_desc != confirmed_product_desc
            and suggested_product_desc not in current_topk_desc):
        filtered_product_desc.append(suggested_product_desc)
        filtered_similarity_scores.append(0.0)

    # Negatives = this row's top-k candidates minus the confirmed description.
    # (The old check compared another row's whole candidate list to a string,
    # which was always unequal, so nothing was ever filtered.)
    for desc, score in zip(current_topk_desc, current_topk_scores):
        if desc != confirmed_product_desc:
            filtered_product_desc.append(desc)
            # Plain float keeps the list homogeneous for serialisation.
            filtered_similarity_scores.append(float(score))

    # Candidate set guaranteed to contain the confirmed item.
    current_topk_desc_all = list(current_topk_desc)
    current_topk_product_ids_all = list(current_topk_product_ids)
    current_topk_product_quantity_type_ids_all = list(current_topk_product_quantity_type_ids)

    if confirmed_product_desc not in current_topk_desc_all:
        current_topk_desc_all.append(confirmed_product_desc)
        current_topk_product_ids_all.append(row['confirmed_product_id'])
        current_topk_product_quantity_type_ids_all.append(row['confirmed_quantity_type_id'])

    predictions_df.at[i, 'topk_standardized_product_desc'] = filtered_product_desc
    predictions_df.at[i, 'topk_similarity_score'] = filtered_similarity_scores
    predictions_df.at[i, 'topk_standardized_product_desc_unfiltered'] = current_topk_desc
    predictions_df.at[i, 'topk_product_ids_unfiltered'] = current_topk_product_ids
    predictions_df.at[i, 'topk_qauntity_type_ids_unfiltered'] = current_topk_product_quantity_type_ids
    predictions_df.at[i, 'topk_standardized_product_desc_unfiltered_with_confirmed'] = current_topk_desc_all
    predictions_df.at[i, 'topk_product_ids_unfiltered_with_confirmed'] = current_topk_product_ids_all
    predictions_df.at[i, 'topk_qauntity_type_ids_unfiltered_with_confirmed'] = current_topk_product_quantity_type_ids_all

    predictions_df.at[i, 'confirmed_name_in_topk_product_name_raw'] = confirmed_desc_in_topk_product_name_raw

display(predictions_df.head())

In [None]:
# Share of rows whose confirmed description appears in the raw top-k list
# (the flag column is filled in by the candidate-building cell).
bicoder_accuracy = (predictions_df['confirmed_name_in_topk_product_name_raw'] == 1).mean()
print(f"Bicoder Accuracy: {bicoder_accuracy}")

In [None]:
# display(predictions_df[['standardized_extracted_product_description','standardized_confirmed_product_description','topk_standardized_product_desc_unfiltered']][predictions_df['confirmed_name_in_topk_product_name_raw'] == 0].head(10))


# FIXME Updated embeddings generation with fine tuning

### Part 2: Cross Encoder

In [None]:
# Number of negative candidates sampled for each positive pair.
NEGATIVES_PER_POS = 5
# Fraction of pairs held out for validation.
TEST_SIZE = 0.1
# Seed shared by random, NumPy, the split and the trainer.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

def format_request(name):
    """Render the query side of a cross-encoder pair."""
    return f"Product: {name}"

def format_candidate(name):
    """Render the candidate side of a cross-encoder pair."""
    return f"Product: {name}"

# Parallel lists: (query text, candidate text) tuples and their 0/1 labels.
pairs = []
labels = []

for idx, row in tqdm(predictions_df.iterrows(), total=len(predictions_df), desc="Generating cross-encoder pairs"):
    query_name = row['standardized_extracted_product_description']
    confirmed_name = row['standardized_confirmed_product_description']

    # Add positive pair
    pairs.append((format_request(query_name), format_candidate(confirmed_name)))
    labels.append(1)

    # Candidate negatives prepared by the catalogue-filtering cell.
    available_negatives = row['topk_standardized_product_desc']

    # Defensive re-filter: never let the confirmed description be used as a negative.
    available_negatives = [n for n in available_negatives if n != confirmed_name]

    # Sample unique negatives up to NEGATIVES_PER_POS
    sampled_negatives = random.sample(available_negatives, min(NEGATIVES_PER_POS, len(available_negatives)))

    for neg_name in sampled_negatives:
        pairs.append((format_request(query_name), format_candidate(neg_name)))
        labels.append(0)

# Convert to DataFrame
df_pairs = pd.DataFrame(pairs, columns=['query', 'candidate'])
df_pairs['label'] = labels

# Train/validation split
# NOTE(review): the split is over individual pairs, so pairs built from the
# same query can land in both train and validation; confirm whether a
# per-query split is wanted.
train_df, val_df = train_test_split(df_pairs, test_size=TEST_SIZE, stratify=df_pairs['label'], random_state=SEED)

print(f"Generated {len(df_pairs)} total pairs.")
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")

display(train_df.head())
display(val_df.head())

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Base checkpoint for the cross encoder; also used to load the model for training.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_batch(examples):
    """Tokenise a batch of (query, candidate) pairs as sentence pairs.

    Every pair is truncated and padded to exactly 128 tokens.
    """
    return tokenizer(
        examples['query'],
        examples['candidate'],
        truncation=True,
        padding="max_length",
        max_length=128
    )

# Wrap the pandas splits as Hugging Face datasets.
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))

train_ds = train_ds.map(tokenize_batch, batched=True)
val_ds = val_ds.map(tokenize_batch, batched=True)

# Drop the raw text columns; only the model inputs and the label are kept.
columns_to_keep = ['input_ids', 'attention_mask', 'label']
train_ds = train_ds.remove_columns([c for c in train_ds.column_names if c not in columns_to_keep])
val_ds = val_ds.remove_columns([c for c in val_ds.column_names if c not in columns_to_keep])

# Return torch tensors when the datasets are indexed.
train_ds.set_format(type='torch', columns=columns_to_keep)
val_ds.set_format(type='torch', columns=columns_to_keep)

print("Train dataset prepared:")
print(train_ds)
print("Validation dataset prepared:")
print(val_ds)


In [None]:
from google.colab import userdata
import wandb

# Log in to Weights & Biases with the key stored under 'WANDB_API_KEY' in
# Colab's user data, so the key is not hard-coded in the notebook.
wandb.login(key=userdata.get('WANDB_API_KEY'))

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
import os

# Drive locations: final model export and intermediate trainer checkpoints.
drive_path = '/content/drive/My Drive/'
directory_name = 'fresho/fresho_take_home/cross-encoder-best'
checkpoint_name = 'fresho/fresho_take_home/cross-encoder-checkpoint'

checkpoint_full_path = os.path.join(drive_path, checkpoint_name)

# Binary classifier head: label 1 = candidate matches the query.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")
metric_precision = evaluate.load("precision")
metric_recall = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Return accuracy, F1, precision and recall for an evaluation batch.

    ``eval_pred`` unpacks to (logits, labels); the predicted class is the
    argmax over the last logits axis.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = metric_acc.compute(predictions=preds, references=labels)['accuracy']
    f1 = metric_f1.compute(predictions=preds, references=labels, average='binary')['f1']
    precision = metric_precision.compute(predictions=preds, references=labels, average='binary')['precision']
    recall = metric_recall.compute(predictions=preds, references=labels, average='binary')['recall']

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# Evaluate and checkpoint every 2000 steps; the best checkpoint by F1 is
# reloaded at the end of training.
training_args = TrainingArguments(
    output_dir=checkpoint_full_path,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    eval_strategy="steps",
    eval_steps=2000,
    logging_steps=1000,
    save_steps=2000,
    learning_rate=2e-5,
    num_train_epochs=2,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
    seed=SEED
)

# NOTE(review): recent transformers releases prefer `processing_class=` over
# `tokenizer=`; confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Find the latest checkpoint in the output directory
latest_checkpoint = None

# Checkpoints are subdirectories named 'checkpoint-XXX'. On a first run the
# output directory may not exist yet, so guard the listing instead of
# letting os.listdir raise FileNotFoundError.
checkpoints = []
if os.path.isdir(checkpoint_full_path):
    checkpoints = [d for d in os.listdir(checkpoint_full_path) if d.startswith('checkpoint-')]
if checkpoints:
    # Pick the highest step number
    latest_checkpoint = os.path.join(checkpoint_full_path, max(checkpoints, key=lambda x: int(x.split('-')[-1])))

# If a checkpoint is found, pass its path to train()
if latest_checkpoint:
    print(f"Resuming training from checkpoint: {latest_checkpoint}")
    trainer.train(resume_from_checkpoint=latest_checkpoint)
else:
    print("No checkpoint found. Starting training from scratch.")
    trainer.train()

# Export the final (best) model next to the checkpoints.
full_path = os.path.join(drive_path, directory_name)

trainer.save_model(full_path)

# Reranking
topk_standardized_product_desc
topk_standardized_product_desc_unfiltered_with_confirmed

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

drive_path = '/content/drive/My Drive/'
directory_name = 'fresho/fresho_take_home/cross-encoder-best'
checkpoint_name = 'fresho/fresho_take_home/cross-encoder-checkpoint'

checkpoint_full_path = os.path.join(drive_path, checkpoint_name)
# Computed here so this cell does not depend on `full_path` left behind by
# the training cell (which may not have run in this session).
best_model_full_path = os.path.join(drive_path, directory_name)

# Find the latest checkpoint in the output directory
latest_checkpoint = None

# Checkpoints are subdirectories named 'checkpoint-XXX'; the directory may be
# absent, so guard the listing.
checkpoints = []
if os.path.isdir(checkpoint_full_path):
    checkpoints = [d for d in os.listdir(checkpoint_full_path) if d.startswith('checkpoint-')]
if checkpoints:
    # Pick the highest step number
    latest_checkpoint = os.path.join(checkpoint_full_path, max(checkpoints, key=lambda x: int(x.split('-')[-1])))

# Prefer the latest checkpoint; otherwise fall back to the exported model.
# NOTE(review): the latest checkpoint is not necessarily the best-by-F1
# model that was exported to cross-encoder-best; confirm which is intended.
if latest_checkpoint:
    MODEL_DIR = latest_checkpoint
else:
    print("No checkpoint found.")
    MODEL_DIR = best_model_full_path

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def score_pair(query_text, candidate_text):
    """Return the model's probability of class 1 for a single text pair."""
    inputs = tokenizer(query_text, candidate_text, return_tensors="pt", truncation=True, max_length=128).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0,1]
    return float(probs)

new_suggested_product_id = []
new_suggested_product_desc = []
new_suggested_quantity_type_id = []
new_scores = []

# Candidates scored per forward pass.
batch_size = 32

# NOTE(review): the '..._with_confirmed' candidate lists always contain the
# confirmed item, so the accuracy computed from this reranking is an upper
# bound; use the '..._unfiltered' columns for an end-to-end number.
for idx, row in tqdm(predictions_df.iterrows(), total=len(predictions_df)):
    query_text = format_request(row['standardized_extracted_product_description'])
    topk_descs = row['topk_standardized_product_desc_unfiltered_with_confirmed']
    topk_quantity_type_ids = row['topk_qauntity_type_ids_unfiltered_with_confirmed']
    topk_product_ids = row['topk_product_ids_unfiltered_with_confirmed']

    candidate_texts = [format_candidate(n) for n in topk_descs]

    # Probability of the "match" class for every candidate, in order.
    scores = []
    for i in range(0, len(candidate_texts), batch_size):
        batch_cands = candidate_texts[i:i+batch_size]
        batch_queries = [query_text]*len(batch_cands)
        inputs = tokenizer(batch_queries, batch_cands, truncation=True, padding=True, return_tensors="pt", max_length=128).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=-1)[:,1].cpu().numpy()
        scores.extend(probs.tolist())

    # Keep the highest-scoring candidate and its identifiers.
    best_idx = int(np.argmax(scores))
    new_suggested_product_desc.append(topk_descs[best_idx])
    new_suggested_quantity_type_id.append(topk_quantity_type_ids[best_idx])
    new_suggested_product_id.append(topk_product_ids[best_idx])
    new_scores.append(scores[best_idx])

predictions_df['re_ranked_product_desc'] = new_suggested_product_desc
predictions_df['re_ranked_product_id'] = new_suggested_product_id
predictions_df['re_ranked_quantity_type_id'] = new_suggested_quantity_type_id
predictions_df['re_ranked_score'] = new_scores

drive_path = '/content/drive/My Drive/'
directory_name = 'fresho/fresho_take_home/predictions_df.parquet'

# Build the output path from the parquet file name. It was previously built
# from checkpoint_name, which points at the checkpoint directory.
predictions_df_full_path = os.path.join(drive_path, directory_name)

predictions_df.to_parquet(predictions_df_full_path, index=False)



In [None]:
# Rows where the reranked product AND quantity type both equal the confirmed ones.
_id_columns = ['confirmed_product_id','re_ranked_product_id','confirmed_quantity_type_id','re_ranked_quantity_type_id']
_quantity_type_matches = predictions_df['confirmed_quantity_type_id'] == predictions_df['re_ranked_quantity_type_id']
_product_matches = predictions_df['confirmed_product_id'] == predictions_df['re_ranked_product_id']
results_df = predictions_df.loc[_quantity_type_matches & _product_matches, _id_columns]
print(f'Accuracy of Part 2 {len(results_df)/len(predictions_df)*100}')

### FIXME Part 3: Use XGBoost to predict quantity