In [1]:
import pandas as pd
import numpy as np


In [35]:
metrics_df = pd.read_csv('data/merged_results.csv')
metrics_df.head()

Unnamed: 0,ORIGINAL_ITEM_TEXT,FIDO_TYPE,DESCRIPTION,TIER1_NAME,BRAND_x,FIDO,PRODUCT_NUMBER,SAMPLE_STORE,SAMPLE_RECEIPT,ITEM_COUNT,MEDIAN(PAM.FINAL_PRICE),GMV,BRAND_y,BRAND_SCORE,CATEGORY,CATEGORY_SCORE,EXPANDED_DESCRIPTION,REASONING_SNIPPET
0,strawberries,PRODUCT,Strawberries,Produce,UNBRANDED,d18956ad-4eaa-408d-9620-26256eed1e4f,27003,COSTCO,6984257f-ae79-417b-9407-9d7fa9135954,348691,4.49,1565622.59,Unbranded,low,Produce,high,"Premium Strawberries, 2 lb clamshell",The product description 'strawberries' refers ...
1,***kswtr40pk,PRODUCT,Kirkland Signature Purified Drinking Water,Beverages,KIRKLAND SIGNATURE,97f8b18c-640d-4105-9a2f-97f726996c5b,782796,COSTCO,14094048-0423-4e8e-8fa1-27f78b5823f3,283476,3.99,1131069.24,Kirkland Signature,high,Beverages,high,"Kirkland Signature Purified Drinking Water, 16...",The original item text '***kswtr40pk' likely a...
2,rotisserie,PRODUCT,Kirkland Signature Rotisserie Chicken,Deli & Bakery,KIRKLAND SIGNATURE,175c65d7-e9ce-4108-838c-8148098892ec,87745,COSTCO,dd890074-a2d9-4aaa-9f1b-9e5c48717e01,258676,4.99,1290793.24,Kirkland Signature,high,Meat & Seafood,high,Kirkland Signature Rotisserie Chicken,The term 'rotisserie' refers to a cooking meth...
3,ks cage free,PRODUCT,Kirkland Signature Large Eggs-cage Free-2 Dozen,Dairy,KIRKLAND SIGNATURE,12a677be-8339-42cc-9c15-495d6a80c0c0,637598,COSTCO,e2053422-40ba-4bb4-afc0-be69182020f2,189662,6.89,1306771.18,Kirkland Signature,high,Dairy,high,"Kirkland Signature Cage-Free Large Eggs, 24 Count",The abbreviation 'ks' corresponds to 'Kirkland...
4,18 ct eggs,PRODUCT,Sauders Eggs Large White 18 Ct,Dairy,SAUDER,5875bf31-3674-4ceb-a4cc-437bc46ec250,1008,COSTCO,17d39fed-4a3a-463b-82b6-eac9df1823d1,162319,5.19,842435.61,Kirkland Signature,high,Dairy,high,"Kirkland Signature Large Grade A Eggs, 18 Count",The product description '18 ct eggs' refers to...


In [36]:
# Import required libraries
from difflib import SequenceMatcher
from thefuzz import fuzz
import pandas as pd
import numpy as np

def calculate_similarity_metrics(row, col1, col2):
    """Calculate multiple similarity metrics between two strings"""
    str1 = str(row[col1]).lower()
    str2 = str(row[col2]).lower()
    
    # Ratios from different algorithms
    sequence_ratio = SequenceMatcher(None, str1, str2).ratio()
    levenshtein_ratio = fuzz.ratio(str1, str2) / 100
    token_sort_ratio = fuzz.token_sort_ratio(str1, str2) / 100
    
    return pd.Series({
        f'{col1}_vs_{col2}_sequence': sequence_ratio,
        f'{col1}_vs_{col2}_levenshtein': levenshtein_ratio,
        f'{col1}_vs_{col2}_token': token_sort_ratio
    })

# Calculate similarity metrics for both pairs of columns
brand_similarities = metrics_df.apply(
    calculate_similarity_metrics, 
    args=('BRAND_x', 'BRAND_y'), 
    axis=1
)

description_similarities = metrics_df.apply(
    calculate_similarity_metrics, 
    args=('DESCRIPTION', 'EXPANDED_DESCRIPTION'), 
    axis=1
)

category_similarities = metrics_df.apply(
    calculate_similarity_metrics, 
    args=('TIER1_NAME', 'CATEGORY'), 
    axis=1
)

# Add similarity scores to the original dataframe
metrics_df = pd.concat([metrics_df, brand_similarities, description_similarities, category_similarities], axis=1)


In [37]:

# Calculate summary statistics
summary_stats = pd.DataFrame({
    'Brand Comparisons': {
        'Mean Sequence Similarity': metrics_df['BRAND_x_vs_BRAND_y_sequence'].mean(),
        'Mean Levenshtein Similarity': metrics_df['BRAND_x_vs_BRAND_y_levenshtein'].mean(),
        'Mean Token Similarity': metrics_df['BRAND_x_vs_BRAND_y_token'].mean()
    },
    'Description Comparisons': {
        'Mean Sequence Similarity': metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_sequence'].mean(),
        'Mean Levenshtein Similarity': metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_levenshtein'].mean(),
        'Mean Token Similarity': metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_token'].mean()
    }
})

# Display summary statistics
print("Similarity Score Summary:")
print(summary_stats)

# Display examples of high and low similarity scores
print("\nExamples of Extreme Cases:")
print("\nLowest Brand Similarity:")
lowest_brand = metrics_df.nsmallest(3, 'BRAND_x_vs_BRAND_y_token')[
    ['BRAND_x', 'BRAND_y', 'BRAND_x_vs_BRAND_y_token']
]
print(lowest_brand)

print("\nHighest Brand Similarity:")
highest_brand = metrics_df.nlargest(3, 'BRAND_x_vs_BRAND_y_token')[
    ['BRAND_x', 'BRAND_y', 'BRAND_x_vs_BRAND_y_token']
]
print(highest_brand)

Similarity Score Summary:
                             Brand Comparisons  Description Comparisons
Mean Sequence Similarity              0.781574                 0.677782
Mean Levenshtein Similarity           0.784660                 0.686320
Mean Token Similarity                 0.789730                 0.735220

Examples of Extreme Cases:

Lowest Brand Similarity:
       BRAND_x BRAND_y  BRAND_x_vs_BRAND_y_token
13   UNBRANDED  Costco                       0.0
112  UNBRANDED  Costco                       0.0
524     SUNSIP    Häde                       0.0

Highest Brand Similarity:
              BRAND_x             BRAND_y  BRAND_x_vs_BRAND_y_token
0           UNBRANDED           Unbranded                       1.0
1  KIRKLAND SIGNATURE  Kirkland Signature                       1.0
2  KIRKLAND SIGNATURE  Kirkland Signature                       1.0


In [38]:
# Calculate weighted average similarity scores
metrics_df['brand_similarity'] = (
    metrics_df['BRAND_x_vs_BRAND_y_sequence'] * 0.3 +
    metrics_df['BRAND_x_vs_BRAND_y_levenshtein'] * 0.3 +
    metrics_df['BRAND_x_vs_BRAND_y_token'] * 0.4
)

metrics_df['description_similarity'] = (
    metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_sequence'] * 0.3 +
    metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_levenshtein'] * 0.3 +
    metrics_df['DESCRIPTION_vs_EXPANDED_DESCRIPTION_token'] * 0.4
)

metrics_df['category_similarity'] = (
    metrics_df['TIER1_NAME_vs_CATEGORY_sequence'] * 0.3 +
    metrics_df['TIER1_NAME_vs_CATEGORY_levenshtein'] * 0.3 +
    metrics_df['TIER1_NAME_vs_CATEGORY_token'] * 0.4
)

# Update summary to include category metrics
summary = pd.DataFrame({
    'Brand Similarity': metrics_df['brand_similarity'].describe(),
    'Description Similarity': metrics_df['description_similarity'].describe(),
    'Category Similarity': metrics_df['category_similarity'].describe()
})

print("Summary of Combined Similarity Scores:")
print(summary)

# Show examples of category similarities
print("\nLowest Category Similarities:")
print(metrics_df.nsmallest(3, 'category_similarity')[
    ['TIER1_NAME', 'CATEGORY', 'category_similarity']
])

print("\nHighest Category Similarities:")
print(metrics_df.nlargest(3, 'category_similarity')[
    ['TIER1_NAME', 'CATEGORY', 'category_similarity']
])

Summary of Combined Similarity Scores:
       Brand Similarity  Description Similarity  Category Similarity
count       1000.000000             1000.000000          1000.000000
mean           0.785762                0.703319             0.855946
std            0.327016                0.183533             0.300063
min            0.000000                0.059647             0.000000
25%            0.500000                0.575800             1.000000
50%            1.000000                0.717308             1.000000
75%            1.000000                0.842171             1.000000
max            1.000000                1.000000             1.000000

Lowest Category Similarities:
    TIER1_NAME            CATEGORY  category_similarity
41      Pantry                Fees                  0.0
964        NaN             Produce                  0.0
991        NaN  Household Supplies                  0.0

Highest Category Similarities:
  TIER1_NAME   CATEGORY  category_similarity
0    Pro

In [39]:
metrics_df

Unnamed: 0,ORIGINAL_ITEM_TEXT,FIDO_TYPE,DESCRIPTION,TIER1_NAME,BRAND_x,FIDO,PRODUCT_NUMBER,SAMPLE_STORE,SAMPLE_RECEIPT,ITEM_COUNT,...,BRAND_x_vs_BRAND_y_token,DESCRIPTION_vs_EXPANDED_DESCRIPTION_sequence,DESCRIPTION_vs_EXPANDED_DESCRIPTION_levenshtein,DESCRIPTION_vs_EXPANDED_DESCRIPTION_token,TIER1_NAME_vs_CATEGORY_sequence,TIER1_NAME_vs_CATEGORY_levenshtein,TIER1_NAME_vs_CATEGORY_token,brand_similarity,description_similarity,category_similarity
0,strawberries,PRODUCT,Strawberries,Produce,UNBRANDED,d18956ad-4eaa-408d-9620-26256eed1e4f,27003,COSTCO,6984257f-ae79-417b-9407-9d7fa9135954,348691,...,1.00,0.500000,0.50,0.51,1.000000,1.00,1.00,1.000000,0.504000,1.000000
1,***kswtr40pk,PRODUCT,Kirkland Signature Purified Drinking Water,Beverages,KIRKLAND SIGNATURE,97f8b18c-640d-4105-9a2f-97f726996c5b,782796,COSTCO,14094048-0423-4e8e-8fa1-27f78b5823f3,283476,...,1.00,0.792453,0.79,0.81,1.000000,1.00,1.00,1.000000,0.798736,1.000000
2,rotisserie,PRODUCT,Kirkland Signature Rotisserie Chicken,Deli & Bakery,KIRKLAND SIGNATURE,175c65d7-e9ce-4108-838c-8148098892ec,87745,COSTCO,dd890074-a2d9-4aaa-9f1b-9e5c48717e01,258676,...,1.00,1.000000,1.00,1.00,0.370370,0.37,0.26,1.000000,1.000000,0.326111
3,ks cage free,PRODUCT,Kirkland Signature Large Eggs-cage Free-2 Dozen,Dairy,KIRKLAND SIGNATURE,12a677be-8339-42cc-9c15-495d6a80c0c0,637598,COSTCO,e2053422-40ba-4bb4-afc0-be69182020f2,189662,...,1.00,0.666667,0.71,0.93,1.000000,1.00,1.00,1.000000,0.785000,1.000000
4,18 ct eggs,PRODUCT,Sauders Eggs Large White 18 Ct,Dairy,SAUDER,5875bf31-3674-4ceb-a4cc-437bc46ec250,1008,COSTCO,17d39fed-4a3a-463b-82b6-eac9df1823d1,162319,...,0.33,0.467532,0.49,0.58,1.000000,1.00,1.00,0.331000,0.519260,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,gv pink salt,PRODUCT,Great Value Fine Himalayan Pink Salt - 17.6 Oz...,Pantry,GREAT VALUE,78104f3e-6c0c-454e-ad14-364a67ffef73,078742368740,WHOLE FOODS MARKET,c88e12a4-1029-429e-a563-93cd057594ab,2,...,1.00,0.727273,0.73,0.84,1.000000,1.00,1.00,1.000000,0.773182,1.000000
996,fncy dlght turk chpc,PRODUCT,Fancy Feast Grilled Turkey And Cheddar Cheese ...,Animals & Pet Supplies,FANCY FEAST,c3b610b7-8bef-41db-b8fe-1df7713b43ba,,WHOLE FOODS MARKET,a864a0d6-43e1-4f11-b3a8-5aa9915b8afe,2,...,0.28,0.413043,0.41,0.38,0.285714,0.29,0.32,0.279333,0.398913,0.300714
997,yellow onions,PRODUCT,YELLOW ONIONS,Produce,UNBRANDED,6c71a46b-d8a1-431c-acf5-35c69d3bdd3b,,WHOLE FOODS MARKET,c35ed3db-f236-44d8-bc54-ffcf7fd7399c,2,...,0.18,0.500000,0.50,0.50,1.000000,1.00,1.00,0.178941,0.500000,1.000000
998,bonipak iceberg lettuce 1,PRODUCT,Iceberg Lettuce,Produce,UNBRANDED,8c1a19f0-18d3-4981-9c86-75291c7fc0b5,0003338365020,WHOLE FOODS MARKET,97799793-5e24-4932-81fc-82f23b9a3b94,2,...,0.25,0.652174,0.65,0.65,1.000000,1.00,1.00,0.250000,0.650652,1.000000


In [40]:
# Calculate metrics based on thresholds
brand_threshold = 0.90
desc_threshold = 0.60

brand_metrics = {
    'Total Pairs': len(metrics_df),
    'Matches (>= 0.90)': sum(metrics_df['brand_similarity'] >= brand_threshold),
    'Non-matches (< 0.90)': sum(metrics_df['brand_similarity'] < brand_threshold),
    'Average Similarity': metrics_df['brand_similarity'].mean(),
    'Median Similarity': metrics_df['brand_similarity'].median()
}

desc_metrics = {
    'Total Pairs': len(metrics_df),
    'Matches (>= 0.60)': sum(metrics_df['description_similarity'] >= desc_threshold),
    'Non-matches (< 0.60)': sum(metrics_df['description_similarity'] < desc_threshold),
    'Average Similarity': metrics_df['description_similarity'].mean(),
    'Median Similarity': metrics_df['description_similarity'].median()
}

# Create summary DataFrame
metrics_summary = pd.DataFrame({
    'Brand Metrics': pd.Series(brand_metrics),
    'Description Metrics': pd.Series(desc_metrics)
})

print("Dataset Metrics Summary:")
print(metrics_summary)

# Calculate percentages
print("\nPerformance Percentages:")
print(f"Brand Match Rate: {(brand_metrics['Matches (>= 0.90)'] / brand_metrics['Total Pairs'] * 100):.2f}%")
print(f"Description Match Rate: {(desc_metrics['Matches (>= 0.60)'] / desc_metrics['Total Pairs'] * 100):.2f}%")

# Show examples of mismatches
print("\nExample Brand Mismatches (Similarity < 0.90):")
print(metrics_df[metrics_df['brand_similarity'] < brand_threshold][
    ['BRAND_x', 'BRAND_y', 'brand_similarity']
].head())

print("\nExample Description Mismatches (Similarity < 0.60):")
print(metrics_df[metrics_df['description_similarity'] < desc_threshold][
    ['ORIGINAL_ITEM_TEXT', 'EXPANDED_DESCRIPTION', 'description_similarity']
].head())

Dataset Metrics Summary:
                      Brand Metrics  Description Metrics
Average Similarity         0.785762             0.703319
Matches (>= 0.60)               NaN           702.000000
Matches (>= 0.90)        662.000000                  NaN
Median Similarity          1.000000             0.717308
Non-matches (< 0.60)            NaN           298.000000
Non-matches (< 0.90)     338.000000                  NaN
Total Pairs             1000.000000          1000.000000

Performance Percentages:
Brand Match Rate: 66.20%
Description Match Rate: 70.20%

Example Brand Mismatches (Similarity < 0.90):
      BRAND_x             BRAND_y  brand_similarity
4      SAUDER  Kirkland Signature          0.331000
6   UNBRANDED      Not Applicable          0.224261
7   UNBRANDED       Not specified          0.270818
9   UNBRANDED  Kirkland Signature          0.370111
13  UNBRANDED              Costco          0.000000

Example Description Mismatches (Similarity < 0.60):
   ORIGINAL_ITEM_TEXT    

In [41]:
metrics_df.to_csv('data/metrics.csv', index=False)

In [42]:
# Calculate average similarity scores
avg_scores = {
    'Brand Similarity': metrics_df['brand_similarity'].mean(),
    'Description Similarity': metrics_df['description_similarity'].mean(),
    'Category Similarity': metrics_df['category_similarity'].mean()
}

# Calculate percentage of perfect scores (1.0)
perfect_scores = {
    'Brand Perfect Match %': (metrics_df['brand_similarity'] == 1.0).mean() * 100,
    'Description Perfect Match %': (metrics_df['description_similarity'] == 1.0).mean() * 100,
    'Category Perfect Match %': (metrics_df['category_similarity'] == 1.0).mean() * 100
}

# Create summary DataFrame
summary_df = pd.DataFrame({
    'Average Score': avg_scores,
    'Perfect Match %': perfect_scores
}).round(2)

print("Similarity Metrics Summary:")
print(summary_df)

# Show distribution of scores near perfect (optional)
print("\nDistribution of Near-Perfect Scores (>= 0.95):")
near_perfect = {
    'Brand': (metrics_df['brand_similarity'] >= 0.95).mean() * 100,
    'Description': (metrics_df['description_similarity'] >= 0.95).mean() * 100,
    'Category': (metrics_df['category_similarity'] >= 0.95).mean() * 100
}
for metric, pct in near_perfect.items():
    print(f"{metric}: {pct:.2f}% of scores >= 0.95")

Similarity Metrics Summary:
                             Average Score  Perfect Match %
Brand Similarity                      0.79              NaN
Description Similarity                0.70              NaN
Category Similarity                   0.86              NaN
Brand Perfect Match %                  NaN             63.3
Description Perfect Match %            NaN              3.4
Category Perfect Match %               NaN             81.0

Distribution of Near-Perfect Scores (>= 0.95):
Brand: 65.30% of scores >= 0.95
Description: 10.40% of scores >= 0.95
Category: 81.00% of scores >= 0.95
