## Importing two pretrained models (from the transformers and sentence-transformers libraries) to use for classification:

### The first LLM is facebook bart-large-mnli

In [None]:
from transformers import pipeline

# Build the zero-shot classification pipeline backed by facebook/bart-large-mnli
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate labels every item is scored against
categories = ["food", "clothing", "transport", "alcohol/entertainment", "home/appliances", "other"]

# Sample items used for a quick sanity check
items = ["chiken plate", "lyft", "creamed corn", "cup", "lingerie", "sofa", "dresser"]

# Classify the items one at a time and report the first label returned with its score
for entry in items:
    outcome = classifier(entry, categories)
    top_label = outcome["labels"][0]
    top_score = outcome["scores"][0]
    print(f"{entry} -> {top_label} (confidence: {top_score:.2f})")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


chiken plate -> food (confidence: 0.79)
lyft -> transport (confidence: 0.86)
creamed corn -> food (confidence: 0.94)
cup -> other (confidence: 0.49)
lingerie -> clothing (confidence: 0.47)
sofa -> other (confidence: 0.94)
dresser -> clothing (confidence: 0.54)


### The second model is all-MiniLM-L6-v2

In [None]:

from sentence_transformers import SentenceTransformer, util

# Load the pretrained sentence-embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Candidate labels each item is matched against
categories = ["food", "clothing", "transport", "alcohol/entertainment", "home/appliances", "other"]

# Sample items used for a quick sanity check
items = ["chiken plate", "lyft", "creamed corn", "cup", "lingerie", "sofa", "dresser"]

# Embed the labels first, then the items
category_embeddings = model.encode(categories)
item_embeddings = model.encode(items)

# For every item, pick the category with the highest cosine similarity
for position, item_vector in enumerate(item_embeddings):
    similarity_row = util.pytorch_cos_sim(item_vector, category_embeddings)[0]
    winner = similarity_row.argmax()
    print(f"{items[position]} -> {categories[winner]}")



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

chiken plate -> food
lyft -> transport
creamed corn -> food
cup -> food
lingerie -> clothing
sofa -> clothing
dresser -> clothing


### Testing the model accuracies
We made a dataset of 500 words with their categories to assess the accuracy of the models.

The main issue with this model so far is that it takes a lot of time to run (over 20 minutes for 500 items).

In [None]:
import csv

import numpy as np  # np.mean is used below; it was previously only imported by a later cell

# Evaluate the zero-shot BART classifier on the labelled receipt dataset.
# `pipeline` and `categories` are taken from the cells above.

# Initialize lists
X = []  # List to store words
y = []  # List to store labels (categories)

# Read the CSV file (newline="" is the mode the csv module recommends for file objects)
csv_filename = "receipt_dataset.csv"
with open(csv_filename, mode="r", newline="") as file:
    reader = csv.reader(file)
    next(reader)  # Skip the header row
    for row in reader:
        if not row:
            continue  # ignore blank lines instead of failing on row[0]
        X.append(row[0])  # First column: Item name
        y.append(row[1])  # Second column: Category

# Print sample output
print("Sample X:", X[:10])
print("Sample y:", y[:10])

zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Single call over the whole list; the result is indexed as one entry per item in X
result = zero_shot_classifier(X, categories)

# Compare predictions and ground truth case-insensitively
predictions = [entry['labels'][0].lower() for entry in result]
y = [label.lower() for label in y]
acc = [predicted == actual for predicted, actual in zip(predictions, y)]

print('facebook bart model accuracy: ', np.mean(acc))

Sample X: ['umbrella', 'sweater', 'map', 'jacket', 'toaster', 'jeans', 'chicken', 'vacuum', 'shoes', 'motorcycle']
Sample y: ['other', 'clothing', 'other', 'clothing', 'home/appliances', 'clothing', 'food', 'home/appliances', 'clothing', 'transport']


Device set to use cpu


facebook bart model accuracy:  0.758


In [None]:
import csv
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used for similarity-based classification
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load the labelled dataset: the header line is dropped, then column 0 is the
# item name and column 1 its category
csv_filename = "receipt_dataset.csv"
with open(csv_filename, mode="r") as csv_file:
    row_iter = csv.reader(csv_file)
    next(row_iter)
    data_rows = list(row_iter)

X = [record[0] for record in data_rows]  # item names
y = [record[1] for record in data_rows]  # category labels

# Labels an item can be assigned to
categories = ["food", "clothing", "transport", "alcohol/entertainment", "home/appliances", "other"]

# Embed the labels and the items as tensors
category_embeddings = model.encode(categories, convert_to_tensor=True)
X_embeddings = model.encode(X, convert_to_tensor=True)

# For each item embedding, keep the label with the largest cosine similarity
predictions = [
    categories[util.cos_sim(item_vector, category_embeddings).argmax().item()].lower()
    for item_vector in X_embeddings
]

# Accuracy = share of items whose prediction equals the lower-cased true label
y = [truth.lower() for truth in y]
accuracy = np.mean([guess == truth for guess, truth in zip(predictions, y)])

print("all-MiniLM-L6-v2 model accuracy:", accuracy)


all-MiniLM-L6-v2 model accuracy: 0.684


### GPT3.5 turbo model
The problem with this model is that I created this dataset with ChatGPT, so the model's measured performance is likely too high and may not generalize to other data.

The biggest problem with this model is that we have to pay for it.

In [None]:
import csv
import numpy as np
import openai

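# OpenAI API key (ensure you have access to GPT-3.5 Turbo).
# Do not hard-code the key in the notebook; supply it from outside, e.g. via the OPENAI_API_KEY environment variable.
#openai.api_key = ####REDACTED#######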

def classify_with_gpt3(item, categories):
    """Classify one item into one of ``categories`` using GPT-3.5 Turbo.

    Args:
        item: Item name to classify.
        categories: Sequence of category labels the model may choose from.

    Returns:
        The lower-cased category named in the model's reply. Surrounding words,
        punctuation and letter case in the reply are ignored, so "Food." and
        "Category: food" both yield "food". If the reply names none of the
        categories, the stripped, lower-cased reply text is returned as is.
    """
    prompt = f"Classify the following item into one of these categories: {', '.join(categories)}.\nItem: {item}\nCategory:"
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that classifies items into categories."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=10,
        temperature=0,  # keep the labelling as repeatable as possible between runs
    )
    # The reply content can be missing; treat that as an empty answer
    answer = (response.choices[0].message.content or "").strip().lower()

    # Callers compare the result to the true label by exact equality, so map
    # the free-text reply onto one of the known labels whenever possible.
    known_labels = [category.lower() for category in categories]
    if answer in known_labels:
        return answer
    # Longest label first, so a label contained in a longer one cannot shadow it
    for candidate in sorted(known_labels, key=len, reverse=True):
        if candidate and candidate in answer:
            return candidate
    return answer

# Load the labelled dataset: the header line is dropped, then column 0 is the
# item name and column 1 its category
csv_filename = "receipt_dataset.csv"
with open(csv_filename, mode="r") as csv_file:
    row_iter = csv.reader(csv_file)
    next(row_iter)
    data_rows = list(row_iter)

X = [record[0] for record in data_rows]  # item names
y = [record[1] for record in data_rows]  # category labels

# Labels offered to the model
categories = ["food", "clothing", "transport", "alcohol/entertainment", "home/appliances", "other"]

# One GPT-3.5 Turbo request per item
predictions = []
for name in X:
    predictions.append(classify_with_gpt3(name, categories))

# Accuracy = share of items whose prediction equals the lower-cased true label
y = [truth.lower() for truth in y]
accuracy = np.mean([guess == truth for guess, truth in zip(predictions, y)])

print("GPT-3.5 Model accuracy:", accuracy)


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

#### confusion matrix:

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

# Example lists
# y = [0, 1, 2, 2, 0, 1, 1, 0, 2]  # True labels
# prediction = [0, 1, 2, 0, 0, 1, 1, 2, 2]  # Predicted labels

# Create the confusion matrix with an explicit label order.
# Without `labels=`, scikit-learn orders rows/columns by the sorted label
# values, which does not match the order of `categories` used for the tick
# labels below, so the axes would be mislabelled.
# Samples whose true or predicted label is not in `categories` are left out.
cm = confusion_matrix(y, predictions, labels=categories)

# Optionally, print it nicely
print("Confusion Matrix:")
print(cm)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)

# Rows are true labels and columns are predicted labels, both in `categories` order
plt.title('Confusion Matrix')
plt.colorbar()
plt.xticks(np.arange(len(categories)), categories, rotation=45)
plt.yticks(np.arange(len(categories)), categories)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


In [None]:
# Spot check: display the second entry of y
y[1]

In [None]:
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Load dataset
file_path = "/content/receipt_dataset.csv"  # Update with your actual file path
df = pd.read_csv(file_path)

# The first column is the item name. The second column is treated as a price
# only when every non-missing value in it is numeric. Renaming it to "price"
# unconditionally made a label column (e.g. 'other') crash the float conversion.
if df.shape[1] < 2:
    raise ValueError(f"Expected at least two columns in {file_path}, found {df.shape[1]}")
second_column = df.columns[1]
numeric_prices = pd.to_numeric(df.iloc[:, 1], errors="coerce")
has_prices = not (numeric_prices.isna() & df.iloc[:, 1].notna()).any()

# Rename positionally, as before, but only call the second column "price" if it is one
column_names = list(df.columns)
column_names[0] = "item"
if has_prices:
    column_names[1] = "price"
df.columns = column_names
if has_prices:
    df["price"] = numeric_prices.astype(float)

# Define categories
categories = ["food", "clothing", "transport", "alcohol/entertainment", "home/appliances", "other"]

### 🔹 Zero-Shot Classification ###
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def classify_zero_shot(item):
    """Return the first label the zero-shot pipeline lists for ``item``."""
    result = zero_shot_classifier(item, categories)
    return result["labels"][0]  # Category with highest confidence

df["zero_shot_category"] = df["item"].apply(classify_zero_shot)

### 🔹 Sentence Similarity Classification ###
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")
category_embeddings = semantic_model.encode(categories)

def classify_semantic(item):
    """Return the category whose embedding has the highest cosine similarity to ``item``."""
    item_embedding = semantic_model.encode(item)
    scores = util.pytorch_cos_sim(item_embedding, category_embeddings)[0]
    return categories[int(scores.argmax())]  # Closest category

df["semantic_category"] = df["item"].apply(classify_semantic)

# Save the per-item classifications
df.to_csv("classified_grocery_receipts.csv", index=False)

if has_prices:
    # Aggregate total spending per category
    zero_shot_spending = df.groupby("zero_shot_category")["price"].sum()
    semantic_spending = df.groupby("semantic_category")["price"].sum()

    # Save results
    zero_shot_spending.to_csv("zero_shot_spending.csv")
    semantic_spending.to_csv("semantic_spending.csv")

    print("Classification completed! Results saved to CSV files.")
else:
    # Nothing to sum: the file carries no numeric price column
    print(
        f"Classification completed and saved, but column {second_column!r} is not numeric, "
        "so per-category spending was not computed."
    )


ValueError: could not convert string to float: 'other'