In [None]:
# GPU-Accelerated Transformer-based Classifier
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np
from google.colab import files
import json

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Fix CSV loading with encoding issues
file_path = '/content/drive/MyDrive/Food additives RAG+LLM/branded_food_short.csv'

# Try different encodings
encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']

df = None
for encoding in encodings_to_try:
    try:
        print(f"Trying encoding: {encoding}")
        df = pd.read_csv(file_path, encoding=encoding)
        print(f"Successfully loaded with encoding: {encoding}")
        break
    except UnicodeDecodeError:
        print(f"Failed with encoding: {encoding}")
        continue
    except Exception as e:
        print(f"Other error with {encoding}: {e}")
        continue

if df is None:
    print("All encodings failed. Trying with error handling...")
    df = pd.read_csv(file_path, encoding='utf-8', errors='ignore')

print(f"Loaded dataframe with shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Using device: cuda
Trying encoding: utf-8
Failed with encoding: utf-8
Trying encoding: latin-1
Successfully loaded with encoding: latin-1
Loaded dataframe with shape: (1048575, 7)
Columns: ['fdc_id', 'brand_owner', 'brand_name', 'subbrand_name', 'ingredients', 'branded_food_category', 'description']


  df = pd.read_csv(file_path, encoding=encoding)


In [None]:

# Data preprocessing
df_clean = df.dropna(subset=['ingredients', 'branded_food_category'])
df_clean['ingredients_clean'] = df_clean['ingredients'].str.lower()

# Encode labels
label_encoder = LabelEncoder()
df_clean['labels'] = label_encoder.fit_transform(df_clean['branded_food_category'])

# Split data
train_df, test_df = train_test_split(df_clean, test_size=0.1, random_state=42)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['ingredients_clean'] = df_clean['ingredients'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['labels'] = label_encoder.fit_transform(df_clean['branded_food_category'])


In [None]:

# Use a lightweight pre-trained model
model_name = "distilbert-base-uncased"  # Fast and effective
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples, truncation=True, padding=True, max_length=256)

# Create datasets
class FoodDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = FoodDataset(
    train_df['ingredients_clean'],
    train_df['labels'],
    tokenizer
)

test_dataset = FoodDataset(
    test_df['ingredients_clean'],
    test_df['labels'],
    tokenizer
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:

# Load model
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

# Training arguments (optimized for GPU)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=500,  # Adjust based on GPU memory
    per_device_eval_batch_size=500,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    fp16=True,  # Use mixed precision for faster training
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

# Train model (GPU-accelerated)
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33myihangfeng5[0m ([33myihangfeng5-university-of-connecticut[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
500,1.4265,1.178782
1000,0.779,0.690233
1500,0.6346,0.569601
2000,0.5272,0.507589
2500,0.4957,0.459939
3000,0.4586,0.425077
3500,0.4263,0.396741
4000,0.3704,0.375052
4500,0.3547,0.359252
5000,0.353,0.345908


TrainOutput(global_step=5583, training_loss=0.7411778183707665, metrics={'train_runtime': 4492.6963, 'train_samples_per_second': 621.173, 'train_steps_per_second': 1.243, 'total_flos': 1.8568167684374477e+17, 'train_loss': 0.7411778183707665, 'epoch': 3.0})

In [None]:

# Evaluate
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = test_df['labels'].values

accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy:.3f}")

# Save model for potential conversion to CoreML
model.save_pretrained('./food_classifier_transformer')
tokenizer.save_pretrained('./food_classifier_transformer')

# Export configuration
model_config = {
    'model_type': 'transformer',
    'model_name': model_name,
    'class_names': label_encoder.classes_.tolist(),
    'num_labels': num_labels
}

with open('transformer_config.json', 'w') as f:
    json.dump(model_config, f, indent=2)

files.download('transformer_config.json')
print("Model saved. You can convert to CoreML for iOS integration.")

Test Accuracy: 0.908


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model saved. You can convert to CoreML for iOS integration.


In [None]:
import coremltools as ct
import torch
import numpy as np

# Load your trained model
model.eval()

# Create a wrapper class that returns only logits
class ModelWrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits  # Return only logits tensor, not dictionary

# Create wrapped model
wrapped_model = ModelWrapper(model)
wrapped_model.eval()

# Create dummy input tensors (tokenized input)
batch_size = 1
max_length = 256
dummy_input_ids = torch.randint(0, tokenizer.vocab_size, (batch_size, max_length))
dummy_attention_mask = torch.ones(batch_size, max_length, dtype=torch.long)

# Trace the wrapped model
with torch.no_grad():
    traced_model = torch.jit.trace(
        wrapped_model,
        (dummy_input_ids, dummy_attention_mask)
    )

# Convert to Core ML
mlmodel = ct.convert(
    traced_model,
    inputs=[
        ct.TensorType(name="input_ids", shape=(1, max_length), dtype=np.int32),
        ct.TensorType(name="attention_mask", shape=(1, max_length), dtype=np.int32)
    ],
    outputs=[ct.TensorType(name="logits")],
    minimum_deployment_target=ct.target.iOS15,
    convert_to="mlprogram"
)

# Add metadata
mlmodel.short_description = "Food Category Classifier - Token Input"
mlmodel.author = "YF"
mlmodel.license = "MIT"
mlmodel.version = "1.0"

# Add descriptions
mlmodel.input_description["input_ids"] = "Tokenized input IDs"
mlmodel.input_description["attention_mask"] = "Attention mask"
mlmodel.output_description["logits"] = "Classification logits"

# Save the model
mlmodel.save("FoodClassifier.mlpackage")
print("Core ML model saved successfully!")

Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 225/226 [00:00<00:00, 3043.08 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 190.50 passes/s]
  return input_var.val.astype(dtype=string_to_nptype(dtype_val))
  max(cur_range.low, tmp_range.low), min(cur_range.high, tmp_range.high)
Running MIL default pipeline: 100%|██████████| 89/89 [00:02<00:00, 29.80 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 224.23 passes/s]


Core ML model saved successfully!


In [None]:
# Save tokenizer vocabulary and configuration for Swift
tokenizer_config = {
    "vocab_size": tokenizer.vocab_size,
    "max_length": 256,
    "vocab": tokenizer.get_vocab(),
    "class_labels": label_encoder.classes_.tolist(),
    "special_tokens": {
        "pad_token": tokenizer.pad_token,
        "pad_token_id": tokenizer.pad_token_id,
        "cls_token": tokenizer.cls_token,
        "cls_token_id": tokenizer.cls_token_id,
        "sep_token": tokenizer.sep_token,
        "sep_token_id": tokenizer.sep_token_id,
        "unk_token": tokenizer.unk_token,
        "unk_token_id": tokenizer.unk_token_id
    }
}

import json
with open('tokenizer_config.json', 'w') as f:
    json.dump(tokenizer_config, f, indent=2)

files.download('tokenizer_config.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Test the converted model
def test_coreml_model():
    try:
        # Load the Core ML model
        loaded_model = ct.models.MLModel("FoodClassifier.mlpackage")

        # Test with sample input
        test_text = "sugar, flour, eggs, vanilla extract"

        # Tokenize the input
        encoded = tokenizer(
            test_text,
            truncation=True,
            padding='max_length',
            max_length=256,
            return_tensors='np'
        )

        # Make prediction
        prediction = loaded_model.predict({
            'input_ids': encoded['input_ids'].astype(np.int32),
            'attention_mask': encoded['attention_mask'].astype(np.int32)
        })

        # Get predicted class
        logits = prediction['logits'][0]  # Remove batch dimension
        predicted_class_idx = np.argmax(logits)
        predicted_class = label_encoder.classes_[predicted_class_idx]

        print(f"Input: {test_text}")
        print(f"Predicted category: {predicted_class}")
        print(f"Confidence scores shape: {logits.shape}")
        print(f"Top 3 predictions:")

        # Show top 3 predictions
        top_indices = np.argsort(logits)[-3:][::-1]
        for i, idx in enumerate(top_indices):
            print(f"  {i+1}. {label_encoder.classes_[idx]}: {logits[idx]:.4f}")

    except Exception as e:
        print(f"Test failed: {e}")

test_coreml_model()

Test failed: Model prediction is only supported on macOS version 10.13 or later.


In [None]:
# Compress and download all files
import shutil
import os

# Create a folder with all necessary files
os.makedirs('FoodClassifierBundle', exist_ok=True)

# Copy Core ML model
if os.path.exists('FoodClassifier.mlpackage'):
    shutil.copytree('FoodClassifier.mlpackage', 'FoodClassifierBundle/FoodClassifier.mlpackage')

# Copy config files
shutil.copy('tokenizer_config.json', 'FoodClassifierBundle/')
shutil.copy('transformer_config.json', 'FoodClassifierBundle/')

# Create bundle
shutil.make_archive('FoodClassifierBundle', 'zip', '.', 'FoodClassifierBundle')
files.download('FoodClassifierBundle.zip')

print("Complete bundle downloaded!")
print("Bundle contains:")
print("- FoodClassifier.mlpackage (Core ML model)")
print("- tokenizer_config.json (vocabulary and tokenizer settings)")
print("- transformer_config.json (model configuration)")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Complete bundle downloaded!
Bundle contains:
- FoodClassifier.mlpackage (Core ML model)
- tokenizer_config.json (vocabulary and tokenizer settings)
- transformer_config.json (model configuration)


In [2]:
import pandas as pd

# Load the CSV file with Latin-1 encoding
df = pd.read_csv('/content/branded_food_short.csv', encoding='latin-1')

# Extract the 'branded_food_category' column (6th column)
category_column = df.columns[5]

# Get unique categories and their count
unique_categories = df[category_column].unique()
num_unique_categories = len(unique_categories)

# Display the results
print(f"Number of unique categories: {num_unique_categories}") # including nan
print("Unique categories:")
print(unique_categories)

Number of unique categories: 258
Unique categories:
['Oils Edible' 'Herbs/Spices/Extracts' 'Prepared Soups'
 'Sauces/Spreads/Dips/Condiments' 'Dough Based Products / Meals'
 'Vegetables  Prepared/Processed' 'Bread' 'Biscuits/Cookies'
 'Sweet Bakery Products' 'Savoury Bakery Products'
 'Non Alcoholic Beverages  Ready to Drink'
 'Meat/Poultry/Other Animals  Unprepared/Unprocessed'
 'Meat/Poultry/Other Animals  Prepared/Processed'
 'Fruit  Prepared/Processed' 'Cookies & Biscuits'
 'Frozen Fruit & Fruit Juice Concentrates'
 'Popcorn, Peanuts, Seeds & Related Snacks'
 "Frozen Appetizers & Hors D'oeuvres" 'Wholesome Snacks'
 'Nut & Seed Butters' 'Chips, Pretzels & Snacks' 'Cheese' 'Rice'
 'Sausages, Hotdogs & Brats' 'Canned Fruit' 'Frozen Vegetables'
 'Crackers & Biscotti' 'Cooked & Prepared' 'Frozen Dinners & Entrees'
 'Other Frozen Desserts' 'Snack, Energy & Granola Bars'
 'Oriental, Mexican & Ethnic Sauces' 'Breads & Buns'
 'Seasoning Mixes, Salts, Marinades & Tenderizers'
 'Croissants, S

  df = pd.read_csv('/content/branded_food_short.csv', encoding='latin-1')
