## Dataset Preparation

In [16]:
search_queries = [
    "buysmartphoneonline",
    "kidsfashionoffer", 
    "applewatchseries6",
    "mensjeanslarge",
    "blueshoeswomens",
    "discountlaptoptech",
    "blackfridaydeals"
]

product_vocabulary = {
    "buy", "sell", "shop", "purchase", "online", "store", "market",
    "smartphone", "phone", "mobile", "iphone", "samsung", "apple", 
    "watch", "series", "series6", "laptop", "computer", "tech", 
    "jeans", "shoes", "shirt", "dress", "pants", "jacket", "fashion",
    "kids", "mens", "womens", "unisex", "boys", "girls",
    "large", "medium", "small", "xl", "xxl", "size",
    "blue", "red", "black", "white", "green", "yellow", "color",
    "discount", "offer", "sale", "deals", "cheap", "price",
    "friday", "monday", "weekend", "special", "new", "latest"
}

print("Search Queries:")
for i, query in enumerate(search_queries, 1):
    print(f"{i}. {query}")

print(f"\nVocabulary Size: {len(product_vocabulary)} words")
print("Sample vocabulary:", list(product_vocabulary)[:10])

Search Queries:
1. buysmartphoneonline
2. kidsfashionoffer
3. applewatchseries6
4. mensjeanslarge
5. blueshoeswomens
6. discountlaptoptech
7. blackfridaydeals

Vocabulary Size: 57 words
Sample vocabulary: ['friday', 'discount', 'market', 'large', 'dress', 'price', 'mobile', 'mens', 'yellow', 'series']


## Algorithm Implementation

In [17]:
def segment_query(query, vocabulary):
    query = query.lower()
    n = len(query)
    dp = [False] * (n + 1)
    dp[0] = True
    parent = [-1] * (n + 1)
    
    for i in range(1, n + 1):
        for j in range(i):
            if dp[j] and query[j:i] in vocabulary:
                dp[i] = True
                parent[i] = j
                break
    
    if not dp[n]:
        return None
    
    result = []
    i = n
    while i > 0:
        j = parent[i]
        result.append(query[j:i])
        i = j
    
    return result[::-1]

def segment_with_fallback(query, vocabulary):
    segments = segment_query(query, vocabulary)
    if segments:
        return segments, True
    
    query = query.lower()
    result = []
    i = 0
    
    while i < len(query):
        found = False
        for length in range(len(query) - i, 0, -1):
            word = query[i:i + length]
            if word in vocabulary:
                result.append(word)
                i += length
                found = True
                break
        
        if not found:
            char = query[i]
            if result and len(result[-1]) == 1 and result[-1] not in vocabulary:
                result[-1] += char
            else:
                result.append(char)
            i += 1
    
    return result, False

## Experimentation

In [18]:
print("Query Segmentation Results:")
print("=" * 50)

for query in search_queries:
    segments, is_perfect = segment_with_fallback(query, product_vocabulary)
    status = "✓ Perfect" if is_perfect else "⚠ Partial"
    
    print(f"Query: {query}")
    print(f"Segments: {' | '.join(segments)}")
    print(f"Status: {status}")
    print("-" * 30)

Query Segmentation Results:
Query: buysmartphoneonline
Segments: buy | smartphone | online
Status: ✓ Perfect
------------------------------
Query: kidsfashionoffer
Segments: kids | fashion | offer
Status: ✓ Perfect
------------------------------
Query: applewatchseries6
Segments: apple | watch | series6
Status: ✓ Perfect
------------------------------
Query: mensjeanslarge
Segments: mens | jeans | large
Status: ✓ Perfect
------------------------------
Query: blueshoeswomens
Segments: blue | shoes | womens
Status: ✓ Perfect
------------------------------
Query: discountlaptoptech
Segments: discount | laptop | tech
Status: ✓ Perfect
------------------------------
Query: blackfridaydeals
Segments: black | friday | deals
Status: ✓ Perfect
------------------------------


## Testing with Unknown Words

In [19]:
test_queries_unknown = [
    "nikerunningshoessale",
    "amazonprimevideo", 
    "teslamodelycar"
]

print("Testing with Queries Containing Unknown Words:")
print("=" * 50)

for query in test_queries_unknown:
    segments, is_perfect = segment_with_fallback(query, product_vocabulary)
    status = "✓ Perfect" if is_perfect else "⚠ Contains unknown words"
    
    print(f"Query: {query}")
    print(f"Segments: {' | '.join(segments)}")
    print(f"Status: {status}")
    print("-" * 30)

Testing with Queries Containing Unknown Words:
Query: nikerunningshoessale
Segments: ni | ke | ru | nn | in | g | shoes | sale
Status: ⚠ Contains unknown words
------------------------------
Query: amazonprimevideo
Segments: am | az | on | pr | im | ev | id | eo
Status: ⚠ Contains unknown words
------------------------------
Query: teslamodelycar
Segments: te | sl | am | od | el | yc | ar
Status: ⚠ Contains unknown words
------------------------------


## Analysis and Improvements

In [20]:
def analyze_segmentation_quality(queries, vocabulary):
    perfect_count = 0
    partial_count = 0
    
    for query in queries:
        segments, is_perfect = segment_with_fallback(query, vocabulary)
        if is_perfect:
            perfect_count += 1
        else:
            partial_count += 1
    
    total = len(queries)
    perfect_ratio = perfect_count / total * 100
    
    print(f"Segmentation Quality Analysis:")
    print(f"Total queries: {total}")
    print(f"Perfect segmentation: {perfect_count} ({perfect_ratio:.1f}%)")
    print(f"Partial segmentation: {partial_count} ({100-perfect_ratio:.1f}%)")
    
    return perfect_ratio

all_queries = search_queries + test_queries_unknown
quality_score = analyze_segmentation_quality(all_queries, product_vocabulary)

print("\nSuggested Improvements:")
print("1. Expand vocabulary with brand names (nike, amazon, tesla)")
print("2. Add technical terms (running, prime, video, model, car)")
print("3. Use machine learning for context-aware segmentation")
print("4. Implement fuzzy matching for misspelled words")
print("5. Add compound word detection algorithms")

Segmentation Quality Analysis:
Total queries: 10
Perfect segmentation: 7 (70.0%)
Partial segmentation: 3 (30.0%)

Suggested Improvements:
1. Expand vocabulary with brand names (nike, amazon, tesla)
2. Add technical terms (running, prime, video, model, car)
3. Use machine learning for context-aware segmentation
4. Implement fuzzy matching for misspelled words
5. Add compound word detection algorithms
