In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
from IPython.display import clear_output
import time

class CategoryScorer:
    def __init__(self):
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        
        self.max_description_length = 20
        
        # Define categories and initial descriptions
        self.category_data = {
            "Sleep": [
                "Going to sleep", 
                "Taking a nap", 
                "Heading to bed", 
                "Resting my eyes", 
                "Unconscious", 
                "Catching some Zs"
            ],
            "Break": [
                "Taking a short break", 
                "Pausing work for a bit", 
                "Relaxing for a moment", 
                "Procrastinating", 
                "Chilling out", 
                "Stopping to rest"
            ],
            "Coding": [
                "Writing code", 
                "Debugging an application", 
                "Programming software", 
                "Developing in Python or Swift", 
                "Building a feature", 
                "Fixing a bug"
            ],
            "Errands": [
                "Doing household chores", 
                "Doing laundry", 
                "Washing the dishes", 
                "Cleaning the house", 
                "Grocery shopping", 
                "Tidying up the room"
            ],
            "Fitness": [
                "Working out", 
                "Going for a run", 
                "Lifting weights at the gym", 
                "Doing cardio", 
                "Playing sports", 
                "Exercising"
            ],
            "Meditation": [
                "Meditating", 
                "Practicing mindfulness", 
                "Doing deep breathing exercises", 
                "Yoga session", 
                "Sitting quietly"
            ],
            "Study": [
                "Studying for an exam", 
                "Doing homework", 
                "Reading a textbook", 
                "Writing an essay", 
                "Learning a new subject", 
                "Classwork"
            ],
            "Work": [
                "Working at my job", 
                "Sitting at my desk working", 
                "In a meeting", 
                "Answering emails", 
                "Professional business tasks", 
                "Career work"
            ],
            "Leisure": [
                "Watching TV", 
                "Playing video games", 
                "Watching a movie", 
                "Scroll social media", 
                "Having fun", 
                "Hobby time", 
                "Relaxing on the couch"
            ],
            "Eat": [
                "Eating a meal", 
                "Having breakfast", 
                "Grabbing lunch", 
                "Eating dinner", 
                "Having a snack", 
                "Drinking water"
            ],
            "Commute": [
                "Commuting to work", 
                "Driving to the office", 
                "Walking to school", 
                "Heading to a destination", 
                "Taking the bus or train", 
                "Riding a bike", 
                "Traveling",
                "Going to work" 
            ]
        }
        
        # Split category descriptions
        for category, descriptions in self.category_data.items():
            if len(descriptions) == 1 and "," in descriptions[0]:
                clean_list = [d.strip() for d in descriptions[0].split(',') if d.strip()]
                self.category_data[category] = clean_list
        
        # Pre-calculate vectors for existing categories
        self.category_vectors = {}
        self.calculate_all_category_vectors()

    def predict(self, text):
        similarities = []
        
        # Get vector embedding of input text
        text_embedding = self.model.encode(text, normalize_embeddings=True)
        
        # Get words in input text
        text_words = set(text.lower().split())
        
        # Compare input vector with all category vectors
        for category, category_embedding in self.category_vectors.items():    
            vector_score = np.dot(text_embedding, category_embedding)
            
            description_words = set(self.get_full_description(category))
            intersection = len(text_words.intersection(description_words))
            keyword_boost = 0.1 * intersection
            
            final_score = vector_score + keyword_boost
            
            similarities.append((category, final_score))
            
        # Sort similarities by score
        similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
            
        return similarities
    
    def get_full_description(self, category):
        return [category] + self.category_data[category]
    
    def get_category_vector(self, category):
        full_description = self.get_full_description(category)
        embeddings = self.model.encode(full_description, normalize_embeddings=True)
        embeddings[0, :] *= 3
        
        centroid = np.mean(embeddings, axis=0)
        normalized_centroid = centroid / np.linalg.norm(centroid)
        
        return normalized_centroid
    
    def calculate_all_category_vectors(self):
        for category in self.category_data:
            self.category_vectors[category] = self.get_category_vector(category)
            
    def update_category(self, category, text):
        if category not in self.category_data:
            self.category_data[category] = []
        
        self.category_data[category].append(text)
        
        if len(self.category_data[category]) > self.max_description_length:
            self.category_data[category] = self.category_data[category][1:]
        
        self.category_vectors[category] = self.get_category_vector(category)
        
# --- THE INTERACTIVE LOOP ---

scorer = CategoryScorer()

while True:
    print("-" * 60)
    user_input = input("Enter activity (or 'q' to quit): ")
    
    if user_input.lower() in ['q', 'quit', 'exit']:
        print("Goodbye!")
        break
        
    if not user_input.strip():
        continue

    # 1. Get Predictions
    predictions = scorer.predict(user_input)
    top_category = predictions[0][0]
    top_score = predictions[0][1]

    # 2. Display Dashboard
    print(f"\nüìù Input: '{user_input}'")
    print(f"ü§ñ Top Prediction: {top_category} ({top_score:.3f})\n")
    
    print("Scores:")
    for i, (cat, score) in enumerate(predictions[:10]): # Show Top 10
        bar = "‚ñà" * int(score * 20)
        print(f"  [{i}] {cat:<12} {score:.3f}  {bar}")
    
    # 3. User Feedback Loop
    print("\nActions:")
    print("  [Enter] Confirm Top Match")
    print("  [0-9]   Select specific category above")
    print("  [n]     Create NEW Category")
    
    choice = input("Select correct category: ").strip().lower()
    
    selected_category = None
    
    # CASE A: Confirm Top Match
    if choice == "":
        selected_category = top_category
        # Only learn if confidence was low, otherwise skip to save space (Optional rule)
        if top_score < 0.8: 
            scorer.update_category(selected_category, user_input)
        else:
            print("‚úÖ High confidence match. No update needed.")

    # CASE B: Select from List
    elif choice.isdigit() and 0 <= int(choice) < 10:
        idx = int(choice)
        selected_category = predictions[idx][0]
        # This is a correction, so we ALWAYS update
        scorer.update_category(selected_category, user_input)

    # CASE C: Create New Category
    elif choice == 'n':
        new_cat_name = input("Enter name for NEW category: ").strip().title()
        if new_cat_name:
            selected_category = new_cat_name
            # Initialize with user input
            scorer.update_category(selected_category, user_input)

    # CASE D: Invalid
    else:
        print("‚ùå Invalid selection. Learning skipped.")

    # Pause for effect so user can read result
    time.sleep(1)

  from .autonotebook import tqdm as notebook_tqdm


------------------------------------------------------------
Goodbye!


In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer
from IPython.display import clear_output
import time

class CategoryScorer:
    def __init__(self):
        self.model = SentenceTransformer("all-mpnet-base-v2")
        
        self.max_description_length = 20
        
        # Define categories and initial descriptions
        self.category_data = {
            "Sleep": ["Sleep, take a nap, go to bed, rest eyes, unconscious."],
            "Break": ["Take a break, pause work, relax for a moment, procrastination, chill."],
            "Coding": ["Write code, programming, software development, debugging, python, swift."],
            "Errands": ["Do chores, laundry, wash dishes, clean house, buy groceries, housework."],
            "Fitness": ["Workout, go to the gym, run, lift weights, exercise, cardio, sports."],
            "Meditation": ["Meditate, mindfulness, deep breathing, yoga, sit quietly."],
            "Study": ["Study for school, do homework, read textbook, learn new things, class work."],
            "Work": ["Do my job, work at office, professional tasks, business, career, meetings."],
            "Leisure": ["Watch TV, watch a movie, play games, entertainment, hobby, fun, relax."],
            "Eat": ["Eat a meal, have breakfast, lunch, dinner, snack, drink water."],
            "Commute": ["Commute, travel, drive car, walk to place, take bus, train, ride bike."],
        }
        
        # Split category descriptions
        for category, descriptions in self.category_data.items():
            if len(descriptions) == 1 and "," in descriptions[0]:
                clean_list = [d.strip() for d in descriptions[0].split(',') if d.strip()]
                self.category_data[category] = clean_list
        
        # Pre-calculate vectors for existing categories
        self.category_vectors = {}
        self.calculate_all_category_vectors()

    def predict(self, text):
        similarities = []
        
        # Get vector embedding of input text
        text_embedding = self.model.encode(text, normalize_embeddings=True)
        
        # Compare input vector with all category vectors
        for category, category_embedding in self.category_vectors.items():
            similarities.append((category, np.dot(text_embedding, category_embedding)))
            
        # Sort similarities by score
        similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
            
        return similarities
    
    def get_category_vector(self, category):
        full_description = [category] + self.category_data[category]
        embeddings = self.model.encode(full_description, normalize_embeddings=True)
        centroid = np.mean(embeddings, axis=0)
        normalized_centroid = centroid / np.linalg.norm(centroid)
        
        return normalized_centroid
    
    def calculate_all_category_vectors(self):
        for category in self.category_data:
            self.category_vectors[category] = self.get_category_vector(category)
            
    def update_category(self, category, text):
        if category not in self.category_data:
            self.category_data[category] = []
        
        self.category_data[category].append(text)
        
        if len(self.category_data[category]) > self.max_description_length:
            self.category_data[category] = self.category_data[category][1:]
        
        self.category_vectors[category] = self.get_category_vector(category)
        
# --- THE INTERACTIVE LOOP ---

scorer = CategoryScorer()

while True:
    print("-" * 60)
    user_input = input("Enter activity (or 'q' to quit): ")
    
    if user_input.lower() in ['q', 'quit', 'exit']:
        print("Goodbye!")
        break
        
    if not user_input.strip():
        continue

    # 1. Get Predictions
    predictions = scorer.predict(user_input)
    top_category = predictions[0][0]
    top_score = predictions[0][1]

    # 2. Display Dashboard
    print(f"\nüìù Input: '{user_input}'")
    print(f"ü§ñ Top Prediction: {top_category} ({top_score:.3f})\n")
    
    print("Scores:")
    for i, (cat, score) in enumerate(predictions[:10]): # Show Top 10
        bar = "‚ñà" * int(score * 20)
        print(f"  [{i}] {cat:<12} {score:.3f}  {bar}")
    
    # 3. User Feedback Loop
    print("\nActions:")
    print("  [Enter] Confirm Top Match")
    print("  [0-9]   Select specific category above")
    print("  [n]     Create NEW Category")
    
    choice = input("Select correct category: ").strip().lower()
    
    selected_category = None
    
    # CASE A: Confirm Top Match
    if choice == "":
        selected_category = top_category
        # Only learn if confidence was low, otherwise skip to save space (Optional rule)
        if top_score < 0.8: 
            scorer.update_category(selected_category, user_input)
        else:
            print("‚úÖ High confidence match. No update needed.")

    # CASE B: Select from List
    elif choice.isdigit() and 0 <= int(choice) < 10:
        idx = int(choice)
        selected_category = predictions[idx][0]
        # This is a correction, so we ALWAYS update
        scorer.update_category(selected_category, user_input)

    # CASE C: Create New Category
    elif choice == 'n':
        new_cat_name = input("Enter name for NEW category: ").strip().title()
        if new_cat_name:
            selected_category = new_cat_name
            # Initialize with user input
            scorer.update_category(selected_category, user_input)

    # CASE D: Invalid
    else:
        print("‚ùå Invalid selection. Learning skipped.")

    # Pause for effect so user can read result
    time.sleep(1)

------------------------------------------------------------
Goodbye!


In [None]:
data = {
    "Sleep": [
        "Going to sleep", 
        "Taking a nap", 
        "Heading to bed", 
        "Resting my eyes", 
        "Unconscious", 
        "Catching some Zs"
    ],
    "Break": [
        "Taking a short break", 
        "Pausing work for a bit", 
        "Relaxing for a moment", 
        "Procrastinating", 
        "Chilling out", 
        "Stopping to rest"
    ],
    "Coding": [
        "Writing code", 
        "Debugging an application", 
        "Programming software", 
        "Developing in Python or Swift", 
        "Building a feature", 
        "Fixing a bug"
    ],
    "Errands": [
        "Doing household chores", 
        "Doing laundry", 
        "Washing the dishes", 
        "Cleaning the house", 
        "Grocery shopping", 
        "Tidying up the room"
    ],
    "Fitness": [
        "Working out", 
        "Going for a run", 
        "Lifting weights at the gym", 
        "Doing cardio", 
        "Playing sports", 
        "Exercising"
    ],
    "Meditation": [
        "Meditating", 
        "Practicing mindfulness", 
        "Doing deep breathing exercises", 
        "Yoga session", 
        "Sitting quietly"
    ],
    "Study": [
        "Studying for an exam", 
        "Doing homework", 
        "Reading a textbook", 
        "Writing an essay", 
        "Learning a new subject", 
        "Classwork"
    ],
    "Work": [
        "Working at my job", 
        "Sitting at my desk working", 
        "In a meeting", 
        "Answering emails", 
        "Professional business tasks", 
        "Career work"
    ],
    "Leisure": [
        "Watching TV", 
        "Playing video games", 
        "Watching a movie", 
        "Scroll social media", 
        "Having fun", 
        "Hobby time", 
        "Relaxing on the couch"
    ],
    "Eat": [
        "Eating a meal", 
        "Having breakfast", 
        "Grabbing lunch", 
        "Eating dinner", 
        "Having a snack", 
        "Drinking water"
    ],
    "Commute": [
        "Commuting to work", 
        "Driving to the office", 
        "Walking to school", 
        "Heading to a destination", 
        "Taking the bus or train", 
        "Riding a bike", 
        "Traveling",
        "Going to work" 
    ]
}

In [3]:
from scorer import Scorer
import time

model = Scorer(data)
model.initialize_vectors()

while True:
    print("-" * 60)
    input_text = input("Enter activity (or 'q' to quit): ")
    
    if input_text.lower() in ['q', 'quit', 'exit']:
        print("Goodbye!")
        break
        
    if not input_text.strip():
        continue
    
    predictions = model.predict(input_text)
    top_category = predictions[0][0]
    top_score = predictions[0][1]
    
    print(top_score, type(top_score))

    # 2. Display Dashboard
    print(f"\nüìù Input: '{input_text}'")
    print(f"ü§ñ Top Prediction: {top_category} ({top_score:.3f})\n")
    
    print("Scores:")
    for i, (cat, score) in enumerate(predictions[:10]): # Show Top 10
        bar = "‚ñà" * int(score * 20)
        print(f"  [{i}] {cat:<12} {score:.3f}  {bar}")
    
    # 3. User Feedback Loop
    print("\nActions:")
    print("  [Enter] Confirm Top Match")
    print("  [0-9]   Select specific category above")
    print("  [n]     Create NEW Category")
    
    choice = input("Select correct category: ").strip().lower()
    
    selected_category = None
    
    # CASE A: Confirm Top Match
    if choice == "":
        selected_category = top_category
        # Only learn if confidence was low, otherwise skip to save space (Optional rule)
        if top_score < 0.8: 
            model.update_descriptions(selected_category, input_text)
        else:
            print("‚úÖ High confidence match. No update needed.")

    # CASE B: Select from List
    elif choice.isdigit() and 0 <= int(choice) < 10:
        idx = int(choice)
        selected_category = predictions[idx][0]
        # This is a correction, so we ALWAYS update
        model.update_descriptions(selected_category, input_text)

    # CASE C: Create New Category
    elif choice == 'n':
        new_cat_name = input("Enter name for NEW category: ").strip().title()
        if new_cat_name:
            selected_category = new_cat_name
            # Initialize with user input
            model.update_descriptions(selected_category, input_text)

    # CASE D: Invalid
    else:
        print("‚ùå Invalid selection. Learning skipped.")

    # Pause for effect so user can read result
    time.sleep(1)

------------------------------------------------------------
0.6095359921455383 <class 'float'>

üìù Input: 'watch instagram'
ü§ñ Top Prediction: Leisure (0.610)

Scores:
  [0] Leisure      0.610  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  [1] Eat          0.482  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  [2] Break        0.470  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  [3] Meditation   0.436  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  [4] Commute      0.417  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  [5] Fitness      0.414  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  [6] Study        0.413  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  [7] Sleep        0.382  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  [8] Coding       0.369  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  [9] Errands      0.368  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

Actions:
  [Enter] Confirm Top Match
  [0-9]   Select specific category above
  [n]     Create NEW Category
------------------------------------------------------------
0.5513965487480164 <class 'float'>

üìù Input: 'watching beaver'
ü§ñ Top Prediction: Leisure (0.551)

Scores:
  [0] Leisure      0.551  

KeyboardInterrupt: Interrupted by user

In [1]:
import numpy as np
from scorer import Scorer

data = {
    "Sleep": [
        "sleeping",
        "going to sleep",
        "taking a nap",
        "napping",
        "heading to bed",
        "lying in bed",
        "trying to sleep",
        "resting",
        "fell asleep",
        "waking up"
    ],

    "Eat": [
        "eating",
        "having a meal",
        "eating food",
        "having breakfast",
        "having lunch",
        "having dinner",
        "grabbing a snack",
        "snacking",
        "drinking water",
        "getting food"
    ],

    "Work": [
        "working",
        "doing work",
        "at work",
        "working on tasks",
        "working on my job",
        "doing my job",
        "office work",
        "working on a project",
        "career work",
        "business work"
    ],

    "Study": [
        "studying",
        "doing homework",
        "studying for an exam",
        "learning",
        "reading notes",
        "reviewing material",
        "doing school work",
        "working on assignments",
        "exam prep",
        "studying concepts"
    ],

    "Commute": [
        "commuting",
        "driving to work",
        "traveling",
        "on the way",
        "heading somewhere",
        "walking to class",
        "taking the bus",
        "riding the train",
        "driving",
        "going somewhere"
    ],

    "Entertainment": [
        "watching tv",
        "watching a show",
        "watching a movie",
        "playing games",
        "gaming",
        "scrolling social media",
        "watching youtube",
        "browsing the internet",
        "entertainment",
        "relaxing with media"
    ],

    "Chores": [
        "doing chores",
        "cleaning",
        "doing laundry",
        "washing dishes",
        "tidying up",
        "housework",
        "organizing",
        "cleaning the house",
        "taking care of chores",
        "running household tasks"
    ],

    "Exercise": [
        "working out",
        "exercising",
        "going to the gym",
        "lifting weights",
        "doing cardio",
        "running",
        "jogging",
        "walking",
        "training",
        "fitness"
    ],

    "Social": [
        "hanging out",
        "spending time with friends",
        "talking with friends",
        "socializing",
        "meeting people",
        "chatting",
        "calling someone",
        "texting",
        "being social",
        "spending time together"
    ],

    "Break": [
        "taking a break",
        "on a break",
        "resting",
        "pausing",
        "stepping away",
        "short break",
        "cooling off",
        "doing nothing",
        "waiting",
        "idle"
    ],

    "Self Care": [
        "self care",
        "taking care of myself",
        "relaxing",
        "meditating",
        "mindfulness",
        "journaling",
        "breathing exercises",
        "therapy",
        "mental health",
        "winding down"
    ],

    "Hobby": [
        "working on a hobby",
        "doing a hobby",
        "creative work",
        "drawing",
        "writing",
        "playing music",
        "practicing an instrument",
        "building something",
        "personal project",
        "doing something I enjoy"
    ]
}

# --- 1. SETUP DATA ---
# (Assuming 'data' is defined in your environment or imported)
# If not, paste the 'category_data' dictionary here from our previous conversation.

labeled_tests = [
    ("went to bed", "Sleep"),
    ("sleeping", "Sleep"),
    ("took a nap", "Sleep"),
    ("power nap", "Sleep"),
    ("resting in bed", "Sleep"),

    ("had breakfast", "Eat"),
    ("ate lunch", "Eat"),
    ("grabbing food", "Eat"),
    ("cooking dinner", "Eat"),
    ("late night snack", "Eat"),

    ("working", "Work"),
    ("coding", "Work"),
    ("team meeting", "Work"),
    ("emails", "Work"),
    ("client call", "Work"),

    ("studying", "Study"),
    ("homework", "Study"),
    ("reviewing notes", "Study"),
    ("exam prep", "Study"),
    ("watching lecture", "Study"),

    ("commuting", "Commute"),
    ("driving to work", "Commute"),
    ("bus ride", "Commute"),
    ("walking to campus", "Commute"),
    ("train ride home", "Commute"),

    ("watching TV", "Entertainment"),
    ("Netflix", "Entertainment"),
    ("playing video games", "Entertainment"),
    ("scrolling TikTok", "Entertainment"),
    ("watching YouTube", "Entertainment"),

    ("cleaning room", "Chores"),
    ("doing laundry", "Chores"),
    ("washing dishes", "Chores"),
    ("taking out trash", "Chores"),
    ("grocery shopping", "Chores"),

    ("working out", "Exercise"),
    ("gym session", "Exercise"),
    ("running", "Exercise"),
    ("yoga", "Exercise"),
    ("lifting weights", "Exercise"),

    ("hanging out with friends", "Social"),
    ("meeting friends", "Social"),
    ("party", "Social"),
    ("chatting with people", "Social"),
    ("catching up with friends", "Social"),

    ("taking a break", "Break"),
    ("coffee break", "Break"),
    ("short break", "Break"),
    ("stepping away", "Break"),
    ("mental break", "Break"),

    ("self care", "Self Care"),
    ("skincare routine", "Self Care"),
    ("meditating", "Self Care"),
    ("journaling", "Self Care"),
    ("taking a bath", "Self Care"),

    ("drawing", "Hobby"),
    ("playing guitar", "Hobby"),
    ("photography", "Hobby"),
    ("writing stories", "Hobby"),
    ("coding for fun", "Hobby"),
]

# List of models to compete against MiniLM-L6
models_to_test = [
    "BAAI/bge-base-en-v1.5",                    # Current Champion (Reference)
    "BAAI/bge-small-en-v1.5",                   # Top Contender (33MB)
    "TaylorAI/gte-tiny",                        # Distilled Logic (29MB)
    "sentence-transformers/all-MiniLM-L6-v2",   # Baseline (22MB)
    "sentence-transformers/paraphrase-MiniLM-L3-v2" # Speed Demon (17MB)
]

# List of K values to test
k_values = [1, 3, 5, 7, 9]

print(f"{'MODEL':<45} | {'K':<3} | {'ACCURACY':<8} | {'SCORE':<8} | {'AVG CONF':<8}")
print("-" * 90)

failures_log = {}

for model_name in models_to_test:
    try:
        model = Scorer(data, model_name)
        
        if "e5-small" in model_name:
            model.query_prefix = "query: "
            model.doc_prefix = "passage: "
        
        model.initialize_vectors()
        
        for k in k_values:
            model.k = k
            
            total_score = 0
            correct_count = 0
            conf_sum = 0
            current_failures = []
            
            for text, true_label in labeled_tests:
                predictions = model.predict(text)
                top_label, top_score = predictions[0]
                
                conf_sum += top_score
                
                # Find Rank
                labels_only = [p[0] for p in predictions]
                try:
                    rank = labels_only.index(true_label) + 1
                except ValueError:
                    rank = 100
                
                # Scoring Logic
                if rank == 1:
                    total_score += top_score
                    correct_count += 1
                else:
                    penalty = top_score * (rank - 1)
                    total_score -= penalty
                    
                    current_failures.append({
                        "text": text,
                        "pred": top_label,
                        "true": true_label,
                        "rank": rank,
                        "conf": top_score
                    })
            
            accuracy = (correct_count / len(labeled_tests)) * 100
            avg_conf = conf_sum / len(labeled_tests)
            
            print(f"{model_name:<45} | {k:<3} | {accuracy:.1f}%   | {total_score:>7.2f}  | {avg_conf:.3f}")
            
            failures_log[f"{model_name} (K={k})"] = current_failures
            
    except Exception as e:
        print(f"{model_name:<45} | FAILED TO LOAD: {e}")

print("\n" + "="*90)
print("‚ùå DETAILED FAILURE LOG (Worst Errors)")
print("="*90)

for config, errors in failures_log.items():
    if not errors:
        continue
        
    print(f"\nüîπ {config}: {len(errors)} Errors")
    
    # Sort by Rank (worst failures first)
    errors.sort(key=lambda x: x['rank'], reverse=True) 
    
    for err in errors[:10]:
        print(f"   ‚Ä¢ '{err['text']}' -> Predicted: {err['pred']} ({err['conf']:.2f}) | True: {err['true']} (Rank #{err['rank']})")

  from .autonotebook import tqdm as notebook_tqdm


MODEL                                         | K   | ACCURACY | SCORE    | AVG CONF
------------------------------------------------------------------------------------------
BAAI/bge-base-en-v1.5                         | 1   | 90.0%   |   42.17  | 0.870
BAAI/bge-base-en-v1.5                         | 3   | 88.3%   |   38.24  | 0.796
BAAI/bge-base-en-v1.5                         | 5   | 90.0%   |   37.50  | 0.757
BAAI/bge-base-en-v1.5                         | 7   | 91.7%   |   37.41  | 0.732
BAAI/bge-base-en-v1.5                         | 9   | 90.0%   |   35.10  | 0.711
BAAI/bge-small-en-v1.5                        | 1   | 83.3%   |   32.12  | 0.888
BAAI/bge-small-en-v1.5                        | 3   | 86.7%   |   36.89  | 0.820
BAAI/bge-small-en-v1.5                        | 5   | 86.7%   |   35.26  | 0.786
BAAI/bge-small-en-v1.5                        | 7   | 86.7%   |   33.60  | 0.764
BAAI/bge-small-en-v1.5                        | 9   | 91.7%   |   36.52  | 0.744
TaylorAI/gte-t

In [6]:
import torch
import coremltools as ct
from transformers import AutoModel, AutoTokenizer
import numpy as np
import os
import shutil

# --- CONFIGURATION ---
model_id = "sentence-transformers/all-MiniLM-L6-v2"
output_path = "MiniLM.mlpackage"
vocab_path = "vocab.txt"

# --- STEP 0: CLEANUP (CRITICAL) ---
# Delete old files to prevent the "FileNotFound" or "FileExists" errors
if os.path.exists(output_path):
    print(f"üßπ Deleting corrupted {output_path}...")
    shutil.rmtree(output_path)

if os.path.exists(vocab_path):
    os.remove(vocab_path)

# --- STEP 1: DOWNLOAD & PREPARE ---
print(f"‚¨áÔ∏è Downloading {model_id}...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Force CPU to avoid architecture conflicts during export
base_model = AutoModel.from_pretrained(model_id, return_dict=False).cpu()
base_model.eval()

# --- STEP 2: WRAPPER (MEAN POOLING) ---
class WrappedModel(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        token_embeddings = outputs[0]
        
        # Mean Pooling Math
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

traced_model = WrappedModel(base_model)

# --- STEP 3: TRACE ---
print("üïµÔ∏è Tracing model graph...")
example_text = "Hello world"
tokens = tokenizer(example_text, return_tensors="pt", padding="max_length", max_length=128)
input_ids = tokens["input_ids"].cpu()
attention_mask = tokens["attention_mask"].cpu()

traced_graph = torch.jit.trace(traced_model, (input_ids, attention_mask))

# --- STEP 4: CONVERT TO COREML ---
print("üçè Converting to CoreML...")
model = ct.convert(
    traced_graph,
    inputs=[
        ct.TensorType(name="input_ids", shape=(1, 128), dtype=np.int32),
        ct.TensorType(name="attention_mask", shape=(1, 128), dtype=np.int32)
    ],
    outputs=[ct.TensorType(name="embeddings")],
    compute_units=ct.ComputeUnit.ALL,
    minimum_deployment_target=ct.target.iOS16
)

# --- STEP 5: COMPRESS TO INT8 ---
print("üóúÔ∏è Compressing weights...")
try:
    from coremltools.optimize.coreml import (
        linear_quantize_weights,
        OpLinearQuantizerConfig,
        OptimizationConfig
    )

    # Configure quantization
    op_config = OpLinearQuantizerConfig(
        mode="linear_symmetric",
        weight_threshold=512
    )
    config = OptimizationConfig(global_config=op_config)
    
    # Apply
    model = linear_quantize_weights(model, config=config)
    
except ImportError:
    print("‚ö†Ô∏è Modern optimization API missing. Falling back to legacy...")
    from coremltools.models.neural_network import quantization_utils
    model = quantization_utils.quantize_weights(model, nbits=8)

# --- STEP 6: SAVE ---
model.save(output_path)
print(f"‚úÖ Saved Model: {output_path}")

# --- STEP 7: SAVE VOCAB ---
print("üìñ Generating vocab.txt...")
vocab = tokenizer.get_vocab()
with open(vocab_path, "w", encoding="utf-8") as f:
    sorted_vocab = sorted(vocab.items(), key=lambda item: item[1])
    for word, index in sorted_vocab:
        f.write(word + "\n")

print(f"‚úÖ Saved Vocab: {vocab_path}")
print("\nüéâ SUCCESS! Drag 'MiniLM.mlpackage' and 'vocab.txt' into Xcode.")

‚¨áÔ∏è Downloading sentence-transformers/all-MiniLM-L6-v2...
üïµÔ∏è Tracing model graph...


Model is not in eval mode. Consider calling '.eval()' on your model prior to conversion


üçè Converting to CoreML...


Converting PyTorch Frontend ==> MIL Ops:   0%|          | 0/273 [00:00<?, ? ops/s]Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.
Converting PyTorch Frontend ==> MIL Ops: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 272/273 [00:00<00:00, 5186.51 ops/s]
Running MIL frontend_pytorch pipeline: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 215.48 passes/s]
Running MIL default pipeline: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 95/95 [00:01<00:00, 82.17 passes/s] 
Running MIL backend_mlprogram pipeline: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:00<00:00, 265.89 passes/s]


üóúÔ∏è Compressing weights...


Running compression pass linear_quantize_weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 101/101 [00:00<00:00, 152.36 ops/s]
Running MIL frontend_milinternal pipeline: 0 passes [00:00, ? passes/s]
Running MIL default pipeline: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 92/92 [00:00<00:00, 189.03 passes/s]
Running MIL backend_mlprogram pipeline: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [00:00<00:00, 242.42 passes/s]


FileNotFoundError: [Errno 2] No such file or directory: 'MiniLM.mlpackage'