In [None]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sentence_transformers import SentenceTransformer

import ipywidgets as widgets
from IPython.display import display


In [None]:
# Input files, resolved relative to the notebook's working directory.
parquet_path = "food_sampled_200k.parquet"
tsv_path = "awesome-food-allergy-datasets.tsv"

# Read the product table and keep only the text columns of interest that are
# actually present; rows without an ingredient list are dropped.
df = pd.read_parquet(parquet_path)
wanted_columns = ["ingredients", "brands", "categories", "labels", "name"]
present_columns = [col for col in wanted_columns if col in df.columns]
df = df[present_columns].dropna(subset=["ingredients"])

# Tab-separated companion table, loaded as-is.
df_tsv = pd.read_csv(tsv_path, sep="\t")

df.shape


In [None]:
# Fetch the NLTK resources used by the text-cleaning step.
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)

# Shared cleaning resources, built once and reused by preprocess().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Normalize an ingredient string for keyword matching and vectorization.

    The text is lower-cased, every character other than ASCII letters, digits,
    commas and spaces is replaced by a space, English stop words are removed
    and the remaining words are passed through the module-level ``lemmatizer``.

    Commas are preserved in the result, but they are split off as separate
    tokens before filtering. A word written directly before a comma
    (e.g. ``"eggs,"``) is therefore compared against ``stop_words`` and handed
    to the lemmatizer as a bare word instead of with the comma attached.

    Parameters
    ----------
    text : str
        Raw ingredient text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces, with each comma attached to
        the token that precedes it.
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9, ]", " ", text)

    # After the substitution only letters, digits, commas and spaces remain,
    # so this pattern yields every word and every comma as its own token.
    raw_tokens = re.findall(r"[a-z0-9]+|,", text)

    kept = []
    for tok in raw_tokens:
        if tok == ",":
            kept.append(tok)
        elif tok not in stop_words:
            kept.append(lemmatizer.lemmatize(tok))

    # Re-attach commas to the preceding token to keep the "a, b, c" layout.
    return " ".join(kept).replace(" ,", ",")

# Build the cleaned text column. preprocess() lower-cases its input itself, so
# a separate vectorized .str.lower() pass over the column is not needed.
# astype(str) guarantees preprocess() always receives a string.
df["clean_ingredients"] = df["ingredients"].astype(str).apply(preprocess)


In [None]:
import re

# Allergen label -> keywords whose presence in the cleaned ingredient text
# marks a product as containing that allergen.
ALLERGEN_MAP = {
    "milk": [
        "milk", "lactose", "casein", "whey", "butter",
    ],
    "egg": [
        "egg", "albumin", "ovalbumin",
    ],
    "soy": [
        "soy", "soya", "lecithin", "e322",
    ],
    "peanut": [
        "peanut", "groundnut", "arachis oil",
    ],
    "wheat": [
        "wheat", "gluten", "farina", "semolina",
    ],
    "fish": [
        "fish", "anchovy", "tuna",
    ],
    "shellfish": [
        "shrimp", "prawn", "crab", "lobster",
    ],
    "tree_nut": [
        "almond", "cashew", "walnut", "hazelnut",
    ],
}

# Label names in definition order; used as target columns and report names.
BIG_8 = list(ALLERGEN_MAP)

# Derive one 0/1 label column per allergen by whole-word keyword matching on
# the cleaned ingredient text.
for allergen, terms in ALLERGEN_MAP.items():
    # A non-capturing group (?:...) is used because Series.str.contains emits
    # a "has match groups" UserWarning for patterns with capturing groups.
    # re.escape keeps any keyword containing regex metacharacters literal.
    alternatives = "|".join(re.escape(term) for term in terms)
    pattern = re.compile(r"\b(?:" + alternatives + r")\b")
    df[allergen] = df["clean_ingredients"].str.contains(pattern).astype(int)

# Number of positive rows per allergen.
df[BIG_8].sum()


In [None]:
# Inputs are the cleaned ingredient strings; targets are the binary allergen
# columns listed in BIG_8.
X = df["clean_ingredients"]
y = df[BIG_8]

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sparse features: unigram TF-IDF whose vocabulary is fitted on the training
# split only and then applied to the test split.
tfidf = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=20,
    max_df=0.9,
    max_features=50000,
    sublinear_tf=True,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Dense features: sentence embeddings for the same two splits.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
encode_options = dict(batch_size=32, show_progress_bar=True, convert_to_numpy=True)
X_train_embed, X_test_embed = (
    embedder.encode(split.tolist(), **encode_options)
    for split in (X_train, X_test)
)


# NOTE(review): the targets were produced by keyword matching on the same
# cleaned text that is used as model input, so the scores below largely
# measure how well each model recovers that keyword rule.

# One binary logistic regression per allergen on the TF-IDF features.
# - random_state pins the stochastic "saga" solver so reruns give the same fit.
# - n_jobs is set on the one-vs-rest wrapper so the per-label fits run in
#   parallel; each inner estimator is a single binary problem.
lr = OneVsRestClassifier(
    LogisticRegression(
        max_iter=1000,
        solver="saga",
        class_weight="balanced",
        random_state=42,
    ),
    n_jobs=-1,
)
lr.fit(X_train_tfidf, y_train)


# One linear SVM per allergen on the sentence-embedding features.
svm = OneVsRestClassifier(LinearSVC(class_weight="balanced", dual=False))
svm.fit(X_train_embed, y_train)


print("\n--- Logistic Regression (TF-IDF) Performance ---")
print(classification_report(y_test, lr.predict(X_test_tfidf), target_names=BIG_8))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report

def get_metrics(y_true, y_pred):
    """Summarize prediction quality as a dict of four scores.

    Parameters
    ----------
    y_true, y_pred
        Ground-truth and predicted labels, passed unchanged to the
        scikit-learn metric functions.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and ``'F1-Score'``
        (in that order). Precision, recall and F1 are macro-averaged.
    """
    scores = {'Accuracy': accuracy_score(y_true, y_pred)}

    macro_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    for label, scorer in macro_scorers:
        scores[label] = scorer(y_true, y_pred, average='macro')

    return scores

# Predict on the held-out split with each fitted model.
lr_preds = lr.predict(X_test_tfidf)
svm_preds = svm.predict(X_test_embed)

lr_results = get_metrics(y_test, lr_preds)
svm_results = get_metrics(y_test, svm_preds)

# One row per model, one column per metric.
results_by_model = {
    'Logistic Regression (TF-IDF)': lr_results,
    'LinearSVC (Embeddings)': svm_results,
}
comparison_df = pd.DataFrame(results_by_model).T

print("--- Model Evaluation Comparison ---")
display(comparison_df)

# Grouped bar chart of the comparison table.
ax = comparison_df.plot(kind='bar', figsize=(10, 6), rot=0)
ax.set_title('AllergyX Model Performance Comparison')
ax.set_ylabel('Score')
ax.set_ylim(0, 1.0)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Per-allergen breakdown for each model.
print("\nDetailed Multi-label Classification Reports:")
for heading, preds in (
    ("LOGISTIC REGRESSION (TF-IDF):", lr_preds),
    ("LINEARSVC (EMBEDDINGS):", svm_preds),
):
    print("-" * 50)
    print(heading)
    print(classification_report(y_test, preds, target_names=BIG_8))

In [None]:
def predict_allergens(text, lr_weight=0.6, svm_weight=0.4, threshold=0.6):
    """Score an ingredient list with both models and return likely allergens.

    The text is cleaned with ``preprocess`` and scored by the TF-IDF logistic
    regression (``predict_proba``) and by the embedding LinearSVC
    (``decision_function`` passed through a logistic sigmoid). The two score
    vectors are then blended as a weighted sum.

    Parameters
    ----------
    text : str
        Raw ingredient list.
    lr_weight : float, default 0.6
        Weight of the logistic-regression probabilities in the blend.
    svm_weight : float, default 0.4
        Weight of the squashed SVM scores in the blend.
    threshold : float, default 0.6
        An allergen is reported only if its blended score is strictly greater
        than this value.

    Returns
    -------
    dict
        ``{allergen: score}`` for every allergen in ``BIG_8`` above the
        threshold, with scores rounded to three decimals. Empty if nothing
        qualifies or if no text is left after cleaning.
    """
    text = preprocess(text)

    # Nothing left to classify (blank input or only stripped characters):
    # report no allergens rather than scoring an empty string.
    if not text.strip():
        return {}

    xt = tfidf.transform([text])
    xe = embedder.encode([text], convert_to_numpy=True)

    p_lr = lr.predict_proba(xt)

    # Squash the SVM decision scores into (0, 1) with a logistic sigmoid so
    # they can be blended with the probabilities; this is a plain squashing
    # step, not a fitted calibration.
    p_svm = 1 / (1 + np.exp(-svm.decision_function(xe)))

    final = lr_weight * p_lr + svm_weight * p_svm

    return {
        a: round(float(p), 3)
        for a, p in zip(BIG_8, final[0])
        if p > threshold
    }


In [None]:
# Free-text area where the user enters an ingredient list.
input_layout = widgets.Layout(width="100%", height="120px")
input_box = widgets.Textarea(
    value="",
    placeholder="Enter ingredient list here",
    layout=input_layout,
)

# Button that triggers detection, and the area its results are printed into.
button = widgets.Button(description="Detect Allergens", button_style="danger")
output_box = widgets.Output()

def on_click(b):
    """Button callback: run detection on the text area and print the result.

    Clears the output area, then prints one ``ALLERGEN : score`` line per
    detected allergen, or a single message when none is detected.

    Parameters
    ----------
    b
        The clicked button, as passed by ipywidgets; unused.
    """
    output_box.clear_output()
    with output_box:
        detected = predict_allergens(input_box.value)
        if not detected:
            print("No major allergens detected")
            return
        for allergen, score in detected.items():
            print(f"{allergen.upper()} : {score}")


# Register the callback and render the controls top to bottom.
button.on_click(on_click)
display(input_box, button, output_box)
