Performs similarity search on selected CSV files using a set of base phrases.

In [9]:
# Helper Functions
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

def chunk_text(text, chunk_size=7, overlap=2):
    # Split the text into words
    words = text.split()
    
    # List to store the chunks
    chunks = []
    
    # Generate chunks with overlap
    for i in range(0, len(words), chunk_size - overlap):
        chunk = words[i:i + chunk_size]
        if len(chunk) == chunk_size:
            chunks.append(" ".join(chunk))
    
    return chunks

def calculate_max_similarities(base_phrases, target_phrases):
    # Encode the base and target phrases
    base_embeddings = model.encode(base_phrases, convert_to_tensor=True)
    target_embeddings = model.encode(target_phrases, convert_to_tensor=True)

    # Calculate cosine similarities between each target phrase and all base phrases
    similarities = util.pytorch_cos_sim(target_embeddings, base_embeddings)

    # Convert similarities to a NumPy array for easier processing
    similarity_matrix = similarities.cpu().numpy()

    # Get the maximum similarity for each target phrase across all base phrases
    max_similarities = similarity_matrix.max(axis=1)

    return max_similarities



In [4]:
import pandas as pd
df = pd.read_csv("ds.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,std_name,company_name,company_website_link,Made_in_USA_related_info_link,notes,list_order,list_title,list_link,Title_List_Name,...,1997-07,1997-06,1997-05,1997-04,1997-03,1997-02,1997-01,1996-12,1996-11,1996-10
0,4113,throttledownkustoms,Throttle Down Kustoms,https://throttledownkustoms.com,https://throttledownkustoms.com/about/,"American-made, custom steel bumpers, frames, s...",,,,,...,,,,,,,,,,
1,4673,treewalker,Treewalker,https://www.treewalkerllc.com,,Tree Stands Made in the USA,,,,,...,,,,,,,,,,
2,3543,savingshepherd,Saving Shepherd,https://www.savingshepherd.com/,,"Amish Handmade Items, Furniture, Toys, Lighti...",,,,,...,,,,,,,,,,
3,472,bicyclecorporationofamericabca,Bicycle Corporation of America (BCA),https://www.bca.bike,https://www.bca.bike/pages/about-our-factory,,,Best American Bicycle Brands: 101 Manufacturer...,https://discerningcyclist.com/american-bicycle...,,...,,,,,,,,,,
4,1788,hangaronevodka,Hangar One Vodka,https://hangarone.com,,,,The 6 Best American-Made Vodkas,https://www.thrillist.com/spirits/vodka/best-a...,,...,,,,,,,,,,


In [12]:
from rapidfuzz.distance.DamerauLevenshtein import normalized_similarity
from tqdm import tqdm

base_phrases = ["made in the the united states", "proudly american"]

computed_similarities = {}
for i in tqdm(range(df.shape[0])):
    for j in tqdm(range(df.shape[1])):
        text = df.iloc[i, j]
        if not(type(text) == str):
            continue

        if len(text) <= 100:
            continue
    
        already_computed = False
        if text[:5] in computed_similarities:
            for candidate_text in computed_similarities[text[:15]]:
                if normalized_similarity(candidate_text, text) > 90:
                    already_computed = True
                    break

        if already_computed:
            continue

        sub_phrases = chunk_text(text)
        
        if not(text[:5] in computed_similarities):
            computed_similarities[text[:5]] = {}

        computed_similarities[text[:5]][text] = calculate_max_similarities(base_phrases, sub_phrases)

100%|██████████| 339/339 [00:30<00:00, 10.94it/s]
100%|██████████| 339/339 [00:01<00:00, 253.90it/s]
100%|██████████| 339/339 [01:09<00:00,  4.91it/s]
100%|██████████| 339/339 [00:10<00:00, 33.07it/s]
100%|██████████| 339/339 [01:24<00:00,  4.02it/s]
100%|██████████| 339/339 [00:07<00:00, 42.79it/s]
100%|██████████| 339/339 [00:04<00:00, 71.26it/s]
100%|██████████| 339/339 [10:57<00:00,  1.94s/it]
100%|██████████| 339/339 [42:40<00:00,  7.55s/it]
100%|██████████| 339/339 [07:21<00:00,  1.30s/it]
100%|██████████| 10/10 [1:04:28<00:00, 386.80s/it]


In [None]:
import pickle
with open("phrase_similarity_scores.pickle", 'wb') as handle:
    pickle.dump(computed_similarities, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
computed_similaries = pickle.load(open("phrase_similarity_scores.pickle", "rb"))

In [24]:
threshold = 0.5
count_df = df.copy()
print(count_df.shape)

start_col_idx = 10

for i in tqdm(range(df.shape[0])):
    for j in tqdm(range(df.shape[1])):
        text = df.iloc[i, j]
        if not(type(text) == str):
            count_df.iloc[i, j] = 0
            continue

        if len(text) <= 100:
            count_df.iloc[i, j] = 0
    
        if text[:5] in computed_similarities:
            for candidate_text in computed_similarities[text[:5]]:
                if normalized_similarity(candidate_text, text) > 90:
                    count = 0
                    for i in computed_similarities[text[:5]][candidate_text]:
                        if i > threshold:
                            count += 1
                    count_df.iloc[i, j] = count
                    break

print("Finished inputting the counts")

# Show the results/growth in terminology over time
for i in tqdm(range(df.shape[0])):
    max_value_seen = 0
    for j in tqdm(range(10, df.shape[1])):
        if count_df.iloc[i, j]:
            max_value_seen = max(max_value_seen, count_df[i][j])
        count_df.iloc[i, j] = max_value_seen

print("Calculated the counts fully")

import matplotlib.pyplot as plt
import pandas as pd

# Ignore the first 10 columns (metadata)
count_df = count_df.iloc[:, 10:]

# Plot each company (row) as an overlapping line chart
plt.figure(figsize=(10, 6))

for idx in count_df.index:
    plt.plot(count_df.columns, count_df.loc[idx], label=f'Company {idx}')

# Add labels and title
plt.title(f'Company Made In America/Proudly American Mentions Over Time w/ Threshold {threshold}')
plt.xlabel('Time')
plt.ylabel('Values')

# Show the legend
plt.legend()

# Display the plot
plt.show()

(10, 339)


100%|██████████| 339/339 [00:32<00:00, 10.52it/s]
100%|██████████| 339/339 [00:00<00:00, 1324.50it/s]
100%|██████████| 339/339 [01:37<00:00,  3.49it/s]
100%|██████████| 339/339 [00:07<00:00, 43.11it/s]
100%|██████████| 339/339 [01:41<00:00,  3.34it/s]
100%|██████████| 339/339 [00:04<00:00, 81.69it/s]
100%|██████████| 339/339 [00:04<00:00, 83.94it/s]
100%|██████████| 339/339 [18:50<00:00,  3.33s/it] 
 35%|███▌      | 120/339 [42:37<1:17:47, 21.31s/it]
 80%|████████  | 8/10 [1:05:34<16:23, 491.81s/it]


KeyboardInterrupt: 