In [1]:
import pandas as pd
import numpy as np


In [9]:
metrics_df = pd.read_csv('data/merged_results.csv')
metrics_df.head()

Unnamed: 0,ORIGINAL_ITEM_TEXT,FIDO_TYPE,DESCRIPTION,TIER1_NAME,BRAND_x,FIDO,PRODUCT_NUMBER,SAMPLE_STORE,SAMPLE_RECEIPT,ITEM_COUNT,MEDIAN(PAM.FINAL_PRICE),GMV,BRAND_y,BRAND_SCORE,CATEGORY,CATEGORY_SCORE,EXPANDED_DESCRIPTION,REASONING_SNIPPET
0,strawberries,PRODUCT,Strawberries,Produce,UNBRANDED,d18956ad-4eaa-408d-9620-26256eed1e4f,27003.0,COSTCO,6984257f-ae79-417b-9407-9d7fa9135954,348691,4.49,1565622.59,Unspecified,low,Fresh Produce,high,"Premium Strawberries, 2 lb clamshell",The product description 'strawberries' corresp...
1,***kswtr40pk,PRODUCT,Kirkland Signature Purified Drinking Water,Beverages,KIRKLAND SIGNATURE,97f8b18c-640d-4105-9a2f-97f726996c5b,782796.0,COSTCO,14094048-0423-4e8e-8fa1-27f78b5823f3,283476,3.99,1131069.24,Kirkland Signature,high,Bottled Water,high,"Kirkland Signature Purified Drinking Water, 16...",The original item text '***kswtr40pk' likely a...
2,rotisserie,PRODUCT,Kirkland Signature Rotisserie Chicken,Deli & Bakery,KIRKLAND SIGNATURE,175c65d7-e9ce-4108-838c-8148098892ec,87745.0,COSTCO,dd890074-a2d9-4aaa-9f1b-9e5c48717e01,258676,4.99,1290793.24,Kirkland Signature,high,Prepared Foods,high,Kirkland Signature Rotisserie Chicken,The term 'rotisserie' refers to a cooking meth...
3,ks cage free,PRODUCT,Kirkland Signature Large Eggs-cage Free-2 Dozen,Dairy,KIRKLAND SIGNATURE,12a677be-8339-42cc-9c15-495d6a80c0c0,637598.0,COSTCO,e2053422-40ba-4bb4-afc0-be69182020f2,189662,6.89,1306771.18,Kirkland Signature,high,Eggs,high,"Kirkland Signature Cage-Free Large Eggs, 24 Count",The original item text 'ks cage free' likely s...
4,18 ct eggs,PRODUCT,Sauders Eggs Large White 18 Ct,Dairy,SAUDER,5875bf31-3674-4ceb-a4cc-437bc46ec250,1008.0,COSTCO,17d39fed-4a3a-463b-82b6-eac9df1823d1,162319,5.19,842435.61,Kirkland Signature,high,Eggs,high,"Kirkland Signature Large Grade A Eggs, 18 Count",The product description '18 ct eggs' refers to...


In [10]:
# Import required libraries
from difflib import SequenceMatcher
from thefuzz import fuzz
import pandas as pd
import numpy as np

def calculate_similarity_metrics(row, col1, col2):
    """Calculate multiple similarity metrics between two strings"""
    str1 = str(row[col1]).lower()
    str2 = str(row[col2]).lower()
    
    # Ratios from different algorithms
    sequence_ratio = SequenceMatcher(None, str1, str2).ratio()
    levenshtein_ratio = fuzz.ratio(str1, str2) / 100
    token_sort_ratio = fuzz.token_sort_ratio(str1, str2) / 100
    
    return pd.Series({
        f'{col1}_vs_{col2}_sequence': sequence_ratio,
        f'{col1}_vs_{col2}_levenshtein': levenshtein_ratio,
        f'{col1}_vs_{col2}_token': token_sort_ratio
    })

# Calculate similarity metrics for both pairs of columns
brand_similarities = metrics_df.apply(
    calculate_similarity_metrics, 
    args=('BRAND_x', 'BRAND_y'), 
    axis=1
)

description_similarities = metrics_df.apply(
    calculate_similarity_metrics, 
    args=('DESCRIPTION', 'EXPANDED_DESCRIPTION'), 
    axis=1
)

# Add similarity scores to the original dataframe
metrics_df = pd.concat([metrics_df, brand_similarities, description_similarities], axis=1)

# Calculate summary statistics
summary_stats = pd.DataFrame({
    'Brand Comparisons': {
        'Mean Sequence Similarity': metrics_df['BRAND_x_vs_BRAND_y_sequence'].mean(),
        'Mean Levenshtein Similarity': metrics_df['BRAND_x_vs_BRAND_y_levenshtein'].mean(),
        'Mean Token Similarity': metrics_df['BRAND_x_vs_BRAND_y_token'].mean()
    },
    'Description Comparisons': {
        'Mean Sequence Similarity': metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_sequence'].mean(),
        'Mean Levenshtein Similarity': metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_levenshtein'].mean(),
        'Mean Token Similarity': metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_token'].mean()
    }
})

# Display summary statistics
print("Similarity Score Summary:")
print(summary_stats)

# Display examples of high and low similarity scores
print("\nExamples of Extreme Cases:")
print("\nLowest Brand Similarity:")
lowest_brand = metrics_df.nsmallest(3, 'BRAND_x_vs_BRAND_y_token')[
    ['BRAND_x', 'BRAND_y', 'BRAND_x_vs_BRAND_y_token']
]
print(lowest_brand)

print("\nHighest Brand Similarity:")
highest_brand = metrics_df.nlargest(3, 'BRAND_x_vs_BRAND_y_token')[
    ['BRAND_x', 'BRAND_y', 'BRAND_x_vs_BRAND_y_token']
]
print(highest_brand)

Similarity Score Summary:
                             Brand Comparisons  Description Comparisons
Mean Sequence Similarity              0.783491                 0.699299
Mean Levenshtein Similarity           0.786050                 0.706750
Mean Token Similarity                 0.791425                 0.764550

Examples of Extreme Cases:

Lowest Brand Similarity:
       BRAND_x     BRAND_y  BRAND_x_vs_BRAND_y_token
13   UNBRANDED      Costco                       0.0
105  UNBRANDED         Oxo                       0.0
221     SUNSIP  Health-Ade                       0.0

Highest Brand Similarity:
              BRAND_x             BRAND_y  BRAND_x_vs_BRAND_y_token
1  KIRKLAND SIGNATURE  Kirkland Signature                       1.0
2  KIRKLAND SIGNATURE  Kirkland Signature                       1.0
3  KIRKLAND SIGNATURE  Kirkland Signature                       1.0


In [11]:
# Calculate weighted average similarity scores
metrics_df['brand_similarity'] = (
    metrics_df['BRAND_x_vs_BRAND_y_sequence'] * 0.3 +
    metrics_df['BRAND_x_vs_BRAND_y_levenshtein'] * 0.3 +
    metrics_df['BRAND_x_vs_BRAND_y_token'] * 0.4
)

metrics_df['description_similarity'] = (
    metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_sequence'] * 0.3 +
    metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_levenshtein'] * 0.3 +
    metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_token'] * 0.4
)

# Display summary of new similarity scores
print("Summary of Combined Similarity Scores:")
summary = pd.DataFrame({
    'Brand Similarity': metrics_df['brand_similarity'].describe(),
    'Description Similarity': metrics_df['description_similarity'].describe()
})
print(summary)

# Show examples of highest and lowest combined similarities
print("\nLowest Combined Brand Similarities:")
print(metrics_df.nsmallest(3, 'brand_similarity')[
    ['BRAND_x', 'BRAND_y', 'brand_similarity']
])

print("\nHighest Combined Brand Similarities:")
print(metrics_df.nlargest(3, 'brand_similarity')[
    ['BRAND_x', 'BRAND_y', 'brand_similarity']
])

Summary of Combined Similarity Scores:
       Brand Similarity  Description Similarity
count        400.000000              400.000000
mean           0.787432                0.727635
std            0.323323                0.162980
min            0.000000                0.192902
25%            0.470176                0.622226
50%            1.000000                0.738975
75%            1.000000                0.846771
max            1.000000                1.000000

Lowest Combined Brand Similarities:
       BRAND_x     BRAND_y  brand_similarity
13   UNBRANDED      Costco               0.0
105  UNBRANDED         Oxo               0.0
221     SUNSIP  Health-Ade               0.0

Highest Combined Brand Similarities:
              BRAND_x             BRAND_y  brand_similarity
1  KIRKLAND SIGNATURE  Kirkland Signature               1.0
2  KIRKLAND SIGNATURE  Kirkland Signature               1.0
3  KIRKLAND SIGNATURE  Kirkland Signature               1.0


In [12]:
metrics_df

Unnamed: 0,ORIGINAL_ITEM_TEXT,FIDO_TYPE,DESCRIPTION,TIER1_NAME,BRAND_x,FIDO,PRODUCT_NUMBER,SAMPLE_STORE,SAMPLE_RECEIPT,ITEM_COUNT,...,EXPANDED_DESCRIPTION,REASONING_SNIPPET,BRAND_x_vs_BRAND_y_sequence,BRAND_x_vs_BRAND_y_levenshtein,BRAND_x_vs_BRAND_y_token,DESCRIPTION_vs_EXPANDED_DESCRIPTION_sequence,DESCRIPTION_vs_EXPANDED_DESCRIPTION_levenshtein,DESCRIPTION_vs_EXPANDED_DESCRIPTION_token,brand_similarity,description_similarity
0,strawberries,PRODUCT,Strawberries,Produce,UNBRANDED,d18956ad-4eaa-408d-9620-26256eed1e4f,27003.0,COSTCO,6984257f-ae79-417b-9407-9d7fa9135954,348691,...,"Premium Strawberries, 2 lb clamshell",The product description 'strawberries' corresp...,0.400000,0.40,0.40,0.500000,0.50,0.51,0.400,0.504000
1,***kswtr40pk,PRODUCT,Kirkland Signature Purified Drinking Water,Beverages,KIRKLAND SIGNATURE,97f8b18c-640d-4105-9a2f-97f726996c5b,782796.0,COSTCO,14094048-0423-4e8e-8fa1-27f78b5823f3,283476,...,"Kirkland Signature Purified Drinking Water, 16...",The original item text '***kswtr40pk' likely a...,1.000000,1.00,1.00,0.800000,0.80,0.82,1.000,0.808000
2,rotisserie,PRODUCT,Kirkland Signature Rotisserie Chicken,Deli & Bakery,KIRKLAND SIGNATURE,175c65d7-e9ce-4108-838c-8148098892ec,87745.0,COSTCO,dd890074-a2d9-4aaa-9f1b-9e5c48717e01,258676,...,Kirkland Signature Rotisserie Chicken,The term 'rotisserie' refers to a cooking meth...,1.000000,1.00,1.00,1.000000,1.00,1.00,1.000,1.000000
3,ks cage free,PRODUCT,Kirkland Signature Large Eggs-cage Free-2 Dozen,Dairy,KIRKLAND SIGNATURE,12a677be-8339-42cc-9c15-495d6a80c0c0,637598.0,COSTCO,e2053422-40ba-4bb4-afc0-be69182020f2,189662,...,"Kirkland Signature Cage-Free Large Eggs, 24 Count",The original item text 'ks cage free' likely s...,1.000000,1.00,1.00,0.666667,0.71,0.93,1.000,0.785000
4,18 ct eggs,PRODUCT,Sauders Eggs Large White 18 Ct,Dairy,SAUDER,5875bf31-3674-4ceb-a4cc-437bc46ec250,1008.0,COSTCO,17d39fed-4a3a-463b-82b6-eac9df1823d1,162319,...,"Kirkland Signature Large Grade A Eggs, 18 Count",The product description '18 ct eggs' refers to...,0.333333,0.33,0.33,0.467532,0.49,0.58,0.331,0.519260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,yogi og grn antx tea,PRODUCT,Yogi Green Tea Super Antioxidant - 16 Count - ...,Beverages,YOGI,acaab811-5696-4437-a1a7-4917a3eb07e6,,WHOLE FOODS MARKET,ee8d5674-24e8-4672-bfac-4ef49fc67f6f,39,...,Yogi Organic Green Antioxidant Tea,The term 'yogi' directly matches the well-know...,1.000000,1.00,1.00,0.551724,0.55,0.70,1.000,0.610517
396,casc og ccoa crspy rc crl,PRODUCT,Cascadian Farm Organic Cocoa Crispy Rice Cerea...,Pantry,CASCADIAN FARM,8dd8ed2c-c97a-4f6a-bd3d-c04b91089b2c,,WHOLE FOODS MARKET,dccaef62-a4df-47d7-a6e8-add78f90ce9f,39,...,Cascadian Farm Organic Cocoa Crispy Rice Cereal,The original item text 'casc og ccoa crspy rc ...,1.000000,1.00,1.00,0.921569,0.92,0.94,1.000,0.928471
397,glac focus vtwtr 20fz,PRODUCT,Vitaminwater Focus Kiwi Strawberry Flavored Wa...,Beverages,VITAMINWATER,b4848954-a61d-4fe8-89b7-90229a0745ff,,WHOLE FOODS MARKET,7171467f-5ea0-470b-af34-663a745de104,38,...,Glacéau Vitaminwater Focus Kiwi-Strawberry Nut...,The original description 'glac focus vtwtr 20f...,0.750000,0.75,0.77,0.704225,0.70,0.74,0.758,0.717268
398,yogi og purely tea,PRODUCT,Yogi Tea Purely Peppermint Caffeine Free Tea -...,Beverages,YOGI,6fdb80cc-26f0-4c1e-bd5f-00575adfd13b,,WHOLE FOODS MARKET,0fcdf8a9-1339-49ea-801f-a1a815fab21d,37,...,Yogi Organic Purely Peppermint Tea,The original description 'yogi og purely tea' ...,1.000000,1.00,1.00,0.636364,0.64,0.70,1.000,0.662909


In [13]:
# Calculate metrics based on thresholds
brand_threshold = 0.90
desc_threshold = 0.60

brand_metrics = {
    'Total Pairs': len(metrics_df),
    'Matches (>= 0.90)': sum(metrics_df['brand_similarity'] >= brand_threshold),
    'Non-matches (< 0.90)': sum(metrics_df['brand_similarity'] < brand_threshold),
    'Average Similarity': metrics_df['brand_similarity'].mean(),
    'Median Similarity': metrics_df['brand_similarity'].median()
}

desc_metrics = {
    'Total Pairs': len(metrics_df),
    'Matches (>= 0.60)': sum(metrics_df['description_similarity'] >= desc_threshold),
    'Non-matches (< 0.60)': sum(metrics_df['description_similarity'] < desc_threshold),
    'Average Similarity': metrics_df['description_similarity'].mean(),
    'Median Similarity': metrics_df['description_similarity'].median()
}

# Create summary DataFrame
metrics_summary = pd.DataFrame({
    'Brand Metrics': pd.Series(brand_metrics),
    'Description Metrics': pd.Series(desc_metrics)
})

print("Dataset Metrics Summary:")
print(metrics_summary)

# Calculate percentages
print("\nPerformance Percentages:")
print(f"Brand Match Rate: {(brand_metrics['Matches (>= 0.90)'] / brand_metrics['Total Pairs'] * 100):.2f}%")
print(f"Description Match Rate: {(desc_metrics['Matches (>= 0.60)'] / desc_metrics['Total Pairs'] * 100):.2f}%")

# Show examples of mismatches
print("\nExample Brand Mismatches (Similarity < 0.90):")
print(metrics_df[metrics_df['brand_similarity'] < brand_threshold][
    ['BRAND_x', 'BRAND_y', 'brand_similarity']
].head())

print("\nExample Description Mismatches (Similarity < 0.60):")
print(metrics_df[metrics_df['description_similarity'] < desc_threshold][
    ['ORIGINAL_ITEM_TEXT', 'EXPANDED_DESCRIPTION', 'description_similarity']
].head())

Dataset Metrics Summary:
                      Brand Metrics  Description Metrics
Average Similarity         0.787432             0.727635
Matches (>= 0.60)               NaN           309.000000
Matches (>= 0.90)        267.000000                  NaN
Median Similarity          1.000000             0.738975
Non-matches (< 0.60)            NaN            91.000000
Non-matches (< 0.90)     133.000000                  NaN
Total Pairs              400.000000           400.000000

Performance Percentages:
Brand Match Rate: 66.75%
Description Match Rate: 77.25%

Example Brand Mismatches (Similarity < 0.90):
     BRAND_x             BRAND_y  brand_similarity
0  UNBRANDED         Unspecified          0.400000
4     SAUDER  Kirkland Signature          0.331000
6  UNBRANDED      Not Applicable          0.224261
7  UNBRANDED      Not Applicable          0.224261
9  UNBRANDED       Not specified          0.270818

Example Description Mismatches (Similarity < 0.60):
   ORIGINAL_ITEM_TEXT          

In [14]:
metrics_df.to_csv('data/metrics.csv', index=False)