# In [26]:
import json
from rapidfuzz import process, fuzz
import re


def normalize_inci(text):
    """
    Produce a canonical form of an INCI name for matching.

    The input is lowercased, '/' and '-' are turned into spaces, every
    character other than a-z, 0-9 or whitespace is dropped, and runs of
    whitespace are collapsed to a single space with none left at the ends.
    """
    cleaned = text.lower()
    # Treat '/' and '-' as word separators instead of deleting them, so that
    # the parts on either side stay distinct tokens.
    for separator in ('/', '-'):
        cleaned = cleaned.replace(separator, ' ')
    # Drop everything that is not a lowercase ASCII letter, digit or whitespace.
    cleaned = re.sub(r'[^a-z0-9\s]', '', cleaned)
    # Collapse whitespace runs, then trim the ends.
    return re.sub(r'\s+', ' ', cleaned).strip()


def fuzzy_lookup(query, database, limit=5, threshold=90):
    """
    Find entries of *database* whose keys fuzzily match *query*.

    The query is normalized with ``normalize_inci`` and compared against the
    keys of *database* using rapidfuzz's ``fuzz.token_set_ratio`` scorer.

    Args:
        query: Ingredient name to search for.
        database: Mapping of key -> entry. The keys are compared as-is, so
            they are presumably expected to be ``normalize_inci`` output
            already (as in ``normalized_index``) — confirm for other callers.
        limit: Maximum number of candidates requested from rapidfuzz.
        threshold: Minimum score a candidate must reach to be returned.

    Returns:
        A list of dicts with the keys "match" (the matching database key),
        "score" and "data" (``database[match]``), in the order rapidfuzz
        returned the candidates. Empty when no candidate reaches *threshold*.
    """
    # NOTE(review): the keys view is passed on purpose. Per the rapidfuzz
    # docs a mapping passed as `choices` is handled differently (its values
    # are compared), so do not simplify this to `database` without checking.
    candidates = process.extract(
        normalize_inci(query),
        database.keys(),
        scorer=fuzz.token_set_ratio,
        limit=limit,
    )

    return [
        {"match": key, "score": score, "data": database[key]}
        for key, score, _ in candidates
        if score >= threshold
    ]


# Load the CosIng JSON database from disk.
with open("src/output.json", "r", encoding="utf-8") as f:
    cosing_data = json.loads(f.read())

# Normalized index: normalize_inci(INCI name) -> original entry.
# Entries whose names normalize to the same key overwrite earlier ones.
normalized_index = {
    normalize_inci(record["inci_name"]): record
    for record in cosing_data.values()
}


def get_inci_info(name, database):
    """
    Look up *name* in *database* by exact key after trimming and lowercasing.

    Returns whatever ``database.get`` yields for that key, i.e. ``None`` when
    the key is absent.

    NOTE(review): only surrounding whitespace and letter case are normalized
    here, whereas ``normalized_index`` is keyed by ``normalize_inci`` output;
    names containing '-', '/' or punctuation will therefore not be found in
    that index — confirm which database this helper is meant for.
    """
    lookup_key = name.strip().lower()
    return database.get(lookup_key)


def api_lookup(name):
    """
    Return the entry of the first usable fuzzy match for *name*.

    Runs ``fuzzy_lookup`` against the module-level ``normalized_index`` with
    its default limit and threshold, and returns the "data" of the first
    result whose data is truthy. Returns ``None`` when there are no results
    or none of them carries truthy data.
    """
    hits = fuzzy_lookup(name, normalized_index) or ()
    return next((hit["data"] for hit in hits if hit["data"]), None)


def get_inci_code(name):
    """
    Return the integer reference number for an ingredient name.

    Looks *name* up with ``api_lookup`` and converts the matched entry's
    "reference_number" field with ``int``.

    Returns:
        The reference number as an int, or -1 when ``api_lookup`` finds no
        entry.

    Raises:
        KeyError: if the matched entry has no "reference_number" field.
        ValueError, TypeError: if that field cannot be converted by ``int``.
    """
    ref = api_lookup(name)
    if not ref:
        return -1
    # Explicit guard instead of `ref and int(...) or -1`: that idiom falls
    # through to -1 whenever the converted number is 0.
    return int(ref['reference_number'])


def test01():
    """
    Print the code found by ``get_inci_code`` for a few sample names.

    One "name → code" line is printed per sample ingredient.
    """
    sample_names = (
        "baba",
        "isoeugenol",
        "tocopherol",
        "lactobacillus",
        "arnica powder",
        "berberis",
        "massoy bark extract",
        "glycerin",
    )
    for ingredient in sample_names:
        ref = get_inci_code(ingredient)
        print(f"{ingredient} → {ref}")