Performs similarity search on selected CSV files using a set of base phrases.

In [9]:
# Helper Functions
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

def chunk_text(text, chunk_size=7, overlap=2):
    # Split the text into words
    words = text.split()
    
    # List to store the chunks
    chunks = []
    
    # Generate chunks with overlap
    for i in range(0, len(words), chunk_size - overlap):
        chunk = words[i:i + chunk_size]
        if len(chunk) == chunk_size:
            chunks.append(" ".join(chunk))
    
    return chunks

def calculate_max_similarities(base_phrases, target_phrases):
    # Encode the base and target phrases
    base_embeddings = model.encode(base_phrases, convert_to_tensor=True)
    target_embeddings = model.encode(target_phrases, convert_to_tensor=True)

    # Calculate cosine similarities between each target phrase and all base phrases
    similarities = util.pytorch_cos_sim(target_embeddings, base_embeddings)

    # Convert similarities to a NumPy array for easier processing
    similarity_matrix = similarities.cpu().numpy()

    # Get the maximum similarity for each target phrase across all base phrases
    max_similarities = similarity_matrix.max(axis=1)

    return max_similarities



In [4]:
import pandas as pd
df = pd.read_csv("ds.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,std_name,company_name,company_website_link,Made_in_USA_related_info_link,notes,list_order,list_title,list_link,Title_List_Name,...,1997-07,1997-06,1997-05,1997-04,1997-03,1997-02,1997-01,1996-12,1996-11,1996-10
0,4113,throttledownkustoms,Throttle Down Kustoms,https://throttledownkustoms.com,https://throttledownkustoms.com/about/,"American-made, custom steel bumpers, frames, s...",,,,,...,,,,,,,,,,
1,4673,treewalker,Treewalker,https://www.treewalkerllc.com,,Tree Stands Made in the USA,,,,,...,,,,,,,,,,
2,3543,savingshepherd,Saving Shepherd,https://www.savingshepherd.com/,,"Amish Handmade Items, Furniture, Toys, Lighti...",,,,,...,,,,,,,,,,
3,472,bicyclecorporationofamericabca,Bicycle Corporation of America (BCA),https://www.bca.bike,https://www.bca.bike/pages/about-our-factory,,,Best American Bicycle Brands: 101 Manufacturer...,https://discerningcyclist.com/american-bicycle...,,...,,,,,,,,,,
4,1788,hangaronevodka,Hangar One Vodka,https://hangarone.com,,,,The 6 Best American-Made Vodkas,https://www.thrillist.com/spirits/vodka/best-a...,,...,,,,,,,,,,


In [12]:
from rapidfuzz.distance.DamerauLevenshtein import normalized_similarity
from tqdm import tqdm

base_phrases = ["made in the the united states", "proudly american"]

computed_similarities = {}
for i in tqdm(range(df.shape[0])):
    for j in tqdm(range(df.shape[1])):
        text = df.iloc[i, j]
        if not(type(text) == str):
            continue

        if len(text) <= 100:
            continue
    
        already_computed = False
        if text[:5] in computed_similarities:
            for candidate_text in computed_similarities[text[:5]]:
                if normalized_similarity(candidate_text, text) > 90:
                    already_computed = True
                    break

        if already_computed:
            continue

        sub_phrases = chunk_text(text)
        
        if not(text[:5] in computed_similarities):
            computed_similarities[text[:5]] = {}

        computed_similarities[text[:5]][text] = calculate_max_similarities(base_phrases, sub_phrases)

100%|██████████| 339/339 [00:30<00:00, 10.94it/s]
100%|██████████| 339/339 [00:01<00:00, 253.90it/s]
100%|██████████| 339/339 [01:09<00:00,  4.91it/s]
100%|██████████| 339/339 [00:10<00:00, 33.07it/s]
100%|██████████| 339/339 [01:24<00:00,  4.02it/s]
100%|██████████| 339/339 [00:07<00:00, 42.79it/s]
100%|██████████| 339/339 [00:04<00:00, 71.26it/s]
 70%|███████   | 7/10 [03:28<01:14, 24.71s/it]

In [None]:
threshold = 0.7
count_df = df.copy()
