In [8]:
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec

# Load the trained Word2Vec model from disk and keep only its word vectors
# (the `.wv` attribute); everything below works on that vectors object.
model = Word2Vec.load("bigram_model.model").wv

In [9]:
# Vocabulary size: number of keys in the vectors' key -> index mapping.
len(model.key_to_index)

332136

In [10]:
# Seed vocabularies, one list per category. Each list is later averaged into a
# single category vector; entries that are not present verbatim in
# `model.key_to_index` are filtered out at that point.

# Category: antiforeign
seed_words_antiforeign = [
    "foreign", "outsider", "alien", "overseas",
    "imported", "external", "foreign-made", "unpatriotic",
    "un-American", "unauthorized", "counterfeit", "offshore",
]

# Category: fairness
seed_words_fairness = [
    "labor", "unions", "rights", "equality", "justice",
    "diversity", "inclusion", "fairness", "safety", "wages",
    "benefits", "respect", "dignity", "opportunity", "representation",
]

# Category: job_growth
seed_words_job_growth = [
    "local", "hire american", "growth", "opportunity",
    "employment", "workforce", "development", "innovation",
    "careers", "training", "industries", "jobs",
    "economy", "expansion", "businesses", "investment",
    "prosperity", "entrepreneurship", "skilled labor",
]

# Category: military
seed_words_military = [
    "service", "honor", "valor",
    "duty", "patriotism", "sacrifice",
    "courage", "mission", "integrity",
    "loyalty", "freedom", "strength",
    "security", "leadership", "heroes",
]

# Category: miu
seed_words_miu = [
    "domestic", "local", "patriotic", "homegrown",
    "American-made", "regional", "community", "in-house",
    "national", "loyal", "heritage",
]

# Category: pride
seed_words_pride = [
    "american", "usa-made", "craftsmanship", "durability", "quality",
    "reliable", "precision", "handcrafted", "engineered", "superior",
    "authentic", "resilient", "premium", "trusted", "innovative",
]

# Category: quality
seed_words_quality = [
    "craftsmanship", "durability", "precision",
    "excellence", "superior", "integrity",
    "workmanship", "innovation", "reliability",
    "tradition", "expertise", "authenticity",
    "heritage", "quality", "trust",
]

In [11]:
# Category name -> list of seed words. Insertion order is the order in which
# the categories are processed when iterating over this mapping.
seed_words = dict(
    antiforeign=seed_words_antiforeign,
    fairness=seed_words_fairness,
    job_growth=seed_words_job_growth,
    military=seed_words_military,
    miu=seed_words_miu,
    pride=seed_words_pride,
    quality=seed_words_quality,
)

In [12]:
from gensim.utils import tokenize
import faiss
import numpy as np
from tqdm import tqdm

# Flat L2 index over bigram embeddings. The dimensionality is read from the
# loaded vectors instead of being hard-coded, so the index always matches the
# model that is actually in memory.
index = faiss.IndexFlatL2(model.vector_size)

# Read the whole corpus; the context manager guarantees the handle is closed.
with open("corpus_replaced.txt", "r") as corpus_file:
    corpus = corpus_file.read()
corpus = list(tokenize(corpus, lowercase=True))

bigrams = []            # (first_word, second_word) tuples, aligned with index rows
bigram_embeddings = []  # mean of the two word vectors for each entry in `bigrams`
not_added = 0           # adjacent pairs skipped because a word is missing from the model

# first word -> {second word: True}; records which pairs are already indexed so
# each distinct bigram is embedded only once.
added_words_1 = {}

for i in tqdm(range(len(corpus) - 1)):
    first_word, second_word = corpus[i], corpus[i + 1]
    if first_word in model and second_word in model:
        seen_followers = added_words_1.setdefault(first_word, {})
        if second_word not in seen_followers:
            bigrams.append((first_word, second_word))
            bigram_embeddings.append((model[first_word] + model[second_word]) / 2)
            seen_followers[second_word] = True
    else:
        # print("Not added: ", first_word, second_word)
        not_added += 1

# faiss expects a 2-D float32 matrix; skip the call entirely when nothing was
# collected, because an empty list would produce a 1-D array and fail.
if bigram_embeddings:
    index.add(np.asarray(bigram_embeddings, dtype=np.float32))

print("Bigrams not added to index count: ", not_added)

100%|██████████| 7508235/7508235 [00:11<00:00, 650001.13it/s]


Bigrams not added to index count:  29578


In [13]:
# Report how many distinct bigram embeddings were collected for the index.
total_bigrams_indexed = len(bigram_embeddings)
print("Total bigrams added to index: ", total_bigrams_indexed)

Total bigrams added to index:  1916114


In [15]:
import numpy as np
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
import os
import json

# Load OPENAI_API_KEY (and any other settings) from the local .env file.
load_dotenv(".env")

# How many nearest bigrams are pulled from the index per category before the
# LLM refinement step.
N_CANDIDATE_BIGRAMS = 400

generated_words = ""        # human-readable report, built up category by category
generated_words_json = {}   # category -> refined bigram list

client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"]
)

filtered_seed_words = {}    # category -> seed words that exist in the model vocabulary

for category in tqdm(seed_words):
    # Keep only the seed words the model actually has a vector for.
    filtered_words = [word for word in seed_words[category] if word in model.key_to_index]
    filtered_seed_words[category] = filtered_words

    if not filtered_words:
        # Averaging zero vectors yields NaN, which would make the index search
        # (and the LLM request built from it) meaningless. Record an empty
        # result for this category and move on.
        generated_words += f"\n\n\n## {category}:\n"
        generated_words += f"\n\n\n## {category} Refined Bigrams:\n"
        generated_words_json[category] = []
        continue

    # Category centroid: mean of the seed word vectors, as float32 for faiss.
    vectors = [model[word] for word in filtered_words]
    average_vector = np.mean(vectors, axis=0).astype(np.float32)

    _, neighbor_ids = index.search(average_vector.reshape(1, -1), N_CANDIDATE_BIGRAMS)
    # faiss pads the result with -1 when the index holds fewer vectors than
    # requested; those must be dropped or bigrams[-1] would be picked silently.
    similar_bigrams = [
        f"{bigrams[i][0]} {bigrams[i][1]}" for i in neighbor_ids[0] if i >= 0
    ]

    generated_words += f"\n\n\n## {category}:\n"
    generated_words += "\n".join(similar_bigrams)

    # Ask the LLM to narrow the candidates down; JSON mode guarantees the
    # reply is a JSON object.
    completion = client.chat.completions.create(
        messages=[{
            "role": "system",
            "content": """Refine the keywords provided and provide a JSON array of the 100 best bigrams generated that are most refined/targeted towards the given category. Response JSON format: {"refined_keywords": ["bigram1", "bigram2", ...]}"""
        }, {
            "role": "user",
            "content": f"# Category: {category}\n\n# Bigrams:\n{json.dumps(similar_bigrams)}"
        }],
        model="gpt-4o",
        response_format={"type": "json_object"}
    )
    response = json.loads(completion.choices[0].message.content)
    refined_bigrams = response["refined_keywords"]

    # Drop every entry that contains another (different) entry as a substring,
    # keeping only the shorter form. A set makes the membership test below cheap.
    removal = {
        longer
        for shorter in refined_bigrams
        for longer in refined_bigrams
        if shorter in longer and shorter != longer
    }

    refined_bigrams = [word for word in refined_bigrams if word not in removal]

    generated_words += f"\n\n\n## {category} Refined Bigrams:\n"
    generated_words += "\n".join(refined_bigrams)

    generated_words_json[category] = refined_bigrams

100%|██████████| 7/7 [00:51<00:00,  7.32s/it]


In [16]:
# Prepend the filtered seed-word dictionary to the report. The guard makes the
# cell idempotent: re-running it does not stack a second copy of the header on
# top of the one already present.
seed_words_header = "# Seed words dictionary \n" + json.dumps(filtered_seed_words, indent=4)
if not generated_words.startswith(seed_words_header):
    generated_words = seed_words_header + generated_words

In [17]:
# Write the human-readable report. The encoding is fixed to UTF-8 so that
# non-ASCII tokens in the bigrams are written the same way on every platform
# instead of depending on the locale's default encoding.
with open("generated_words.txt", "w", encoding="utf-8") as f:
    f.write(generated_words)

In [18]:
# Persist the category -> refined bigrams mapping as JSON.
serialized_words = json.dumps(generated_words_json)
with open("generated_words.json", "w") as json_file:
    json_file.write(serialized_words)

In [None]:
# Find 100 words close to this averaged vector.


In [None]:
import json

# Candidate single-word keywords to be de-duplicated and pruned.
words = [
    "craftsmanship", "excellence", "longevity", "reliability",
    "integrity", "innovation", "durability", "superior",
    "dedication", "unmatched", "exceptional", "expertise",
    "precision", "quality", "refinement", "standards",
    "authenticity", "workmanship", "dependability", "tradition",
    "unparalleled", "strength", "commitment", "consistency",
    "remarkable", "legacy", "elegance", "perfection",
    "knowledge", "toughness", "professionalism", "skill",
    "performance", "capability", "dedicated", "refined",
    "attention", "unwavering", "extraordinary", "industry",
    "artistry", "enduring", "resilience", "highest",
    "specialized", "innovations", "providing", "engineers",
    "pursuit", "detail", "technology", "masterful",
    "timeless", "professionals", "finest", "skilled",
    "artistic", "authentic", "heritage", "materials",
    "rigorous", "honesty", "vision", "assurance",
    "greatest", "strong", "engineered", "creating",
    "sustainable", "growth", "fabrication", "scientists",
    "driven", "craft", "culture", "flawless",
    "exquisite", "dependable", "pride", "creativity",
    "meticulous", "achieving", "groundbreaking", "development",
    "reliable", "expectations", "experienced", "perseverance",
    "environment", "construction", "sophistication", "pioneering",
    "rugged", "comfort", "philosophy", "confidence",
    "clarity", "endurance", "distinctive", "leadership",
    "distinct", "integration",
]
# Remove exact duplicates and put the list in alphabetical order.
words = sorted(set(words))

# Collect every word that contains a different word from the list as a
# substring (e.g. "skilled" contains "skill"); only the shorter form is kept.
removal = [
    candidate
    for fragment in words
    for candidate in words
    if fragment != candidate and fragment in candidate
]

print("Len before: ", len(words))

words = [kept for kept in words if kept not in removal]
print(json.dumps(words, indent=4))
print(len(words), f"removed {len(removal)} words")
print(removal)

Len before:  102
[
    "achieving",
    "artistic",
    "artistry",
    "assurance",
    "attention",
    "authentic",
    "capability",
    "clarity",
    "comfort",
    "commitment",
    "confidence",
    "consistency",
    "construction",
    "craft",
    "creating",
    "creativity",
    "culture",
    "dedicated",
    "dedication",
    "dependability",
    "dependable",
    "detail",
    "development",
    "distinct",
    "driven",
    "durability",
    "elegance",
    "endurance",
    "enduring",
    "engineered",
    "engineers",
    "environment",
    "excellence",
    "exceptional",
    "expectations",
    "experienced",
    "expertise",
    "exquisite",
    "extraordinary",
    "fabrication",
    "finest",
    "flawless",
    "greatest",
    "groundbreaking",
    "growth",
    "heritage",
    "highest",
    "honesty",
    "industry",
    "innovation",
    "integration",
    "integrity",
    "knowledge",
    "leadership",
    "legacy",
    "longevity",
    "masterful",
    "ma