In [9]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# pre-trained NLP model with word embeddings
nlp = spacy.load("en_core_web_md")

In [3]:
categories = {
    "Transportation": "items related to the movement of people or goods, including vehicles, fuels, and public transport",
    "Plastics": "materials or items made of synthetic polymers, such as plastic bottles, bags, and containers",
    "Energy Consumption": "items or activities related to the usage of energy, such as gas, electricity, or fossil fuels",
    "Food production": "items related to growing, harvesting, or producing food, including agriculture, farming, and food processing",
    "Water usage": "items or activities related to the consumption or conservation of water, such as irrigation, plumbing, or water bills",
    "Housing and Construction": "items related to buildings, homes, or construction materials, including apartments, cement, and insulation",
    "Clothing and Textiles": "items related to garments, fabrics, or textile production, such as shirts, jeans, and sewing materials",
    "Travel and Tourism": "items or activities related to leisure travel, including plane tickets, hotels, and tourist attractions",
    "Personal care and Hygiene": "items used for personal grooming or hygiene, such as soap, shampoo, and cosmetics",
    "Packaging and Shipping": "items related to packaging materials or the shipping of goods, such as boxes, packaging tape, and crates", 
    "Technology": "items related to technology, such as tv, and phone, or technology services, such as chatgpt"
}

In [12]:
category_descriptions = list(categories.values())
category_names = list(categories.keys())

# Define a function to categorize items using TF-IDF and cosine similarity
def categorize_item_tfidf(item, categories):

    category_descriptions = list(categories.values())
    category_names = list(categories.keys())
    # Create the TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Combine the item and the category descriptions for vectorization
    all_texts = [item] + category_descriptions

    # Vectorize the item and category descriptions
    tfidf_matrix = vectorizer.fit_transform(all_texts)

    # Compute the cosine similarity between the item vector and category vectors
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

    # Find the category with the highest similarity score
    highest_similarity_index = cosine_similarities.argmax()
    best_category = category_names[highest_similarity_index]

    return best_category

In [4]:
def categorize_item(item, categories):
    item_doc = nlp(item.lower())
    best_match = None
    best_score = -1
    for category, description in categories.items():
        category_doc = nlp(description)
        similarity = item_doc.similarity(category_doc)
        if similarity > best_score:
            best_score = similarity
            best_match = category
    return best_match


In [14]:
def categorize_item(item, categories):
    category_descriptions = list(categories.values())
    category_names = list(categories.keys())
    # Create the TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Combine the item and the category descriptions for vectorization
    all_texts = [item] + category_descriptions

    # Vectorize the item and category descriptions
    tfidf_matrix = vectorizer.fit_transform(all_texts)

    # Compute the cosine similarity between the item vector and category vectors
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

    # Find the category with the highest similarity score
    highest_similarity_index = cosine_similarities.argmax()
    best_category = category_names[highest_similarity_index]

    return best_category


In [13]:
categorize_item_tfidf("chatgpt", categories)

'Technology'

In [15]:
# List of items to categorize
items = ["gas", "plastic bottle", "compost", "toxic chemicals", "chatgpt"]

# Categorize each item
categorized_items = [(item, categorize_item(item, categories)) for item in items]

for item, category in categorized_items:
    print(f"Item: {item} -> Category: {category}")


Item: gas -> Category: Energy Consumption
Item: plastic bottle -> Category: Plastics
Item: compost -> Category: Transportation
Item: toxic chemicals -> Category: Transportation
Item: chatgpt -> Category: Technology
