In [1]:
import re
import pandas as pd
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util

# Changed to index_col=[0] to prevent Unnamed: X columns
df = pd.read_csv("company_website_second_round_with_additional_firms.csv", low_memory=True, index_col=[0])
df = df.drop(columns=[col for col in df.columns if col.startswith('Unnamed:')]) # If th

  from tqdm.autonotebook import tqdm, trange
  df = pd.read_csv("company_website_second_round_with_additional_firms.csv", low_memory=True, index_col=[0])


In [2]:
non_temporal_df = df.iloc[:, 0:13] # Will need to change indexing if more non-temporal columns are added
df = df.drop(df.columns[0:13], axis=1)
assert not any(re.match(r"\d{4}-\d{2}", col) for col in non_temporal_df.columns), "Columns with the pattern 'YYYY-MM' are present in the non-temporal dataframe"
assert all(re.match(r"\d{4}-\d{2}", col) for col in df.columns), "Not all columns follow the 'YYYY-MM' pattern"

In [3]:
def split_df(df, num_dfs=15):
    total_columns = len(df.columns)
    columns_per_df = total_columns // num_dfs
    extra_columns = total_columns % num_dfs
    
    dfs = {}
    start_col = 0
    for i in range(num_dfs):
        end_col = start_col + columns_per_df + (1 if i < extra_columns else 0)
        dfs[f'df_{i}'] = df.iloc[:, start_col:end_col]
        start_col = end_col

    # Ensure no columns are missing
    original_columns = set(df.columns)
    split_columns = set(col for df_split in dfs.values() for col in df_split.columns)
    assert sum(len(dfs[f'df_{i}'].columns) for i in range(num_dfs)) == total_columns, "The total number of columns does not match."
    assert original_columns == split_columns, "Not all columns are accounted for in the split DataFrames."

    return dfs


dfs = split_df(df) # Dict keys: ['df_0', 'df_1', 'df_2', ... 'df_num_dfs']

In [4]:
def compute_hash(string):
    ans = 0
    for i in range(0, min(len(string), 8)):
        ans += ord(string[i]) * (7**i)
    return ans

In [5]:
table = {}

for index, row in enumerate(df.itertuples()):
    if index % 100 == 0:
        print(f"Finished processing {index} out of {len(df.index)} rows.")
    for value in row:
        if pd.isna(value) or type(value) == int:
            continue
        value_hash = compute_hash(value)
        
        if not(value_hash in table):
            table[value_hash] = []

        max_similarity = 0
        for existing_value in table[value_hash]:
            max_similarity = max(max_similarity, fuzz.ratio(existing_value, value))

        if max_similarity < 0.95:
            table[value_hash].append(value)

Finished processing 0 out of 5189 rows.
Finished processing 100 out of 5189 rows.
Finished processing 200 out of 5189 rows.
Finished processing 300 out of 5189 rows.
Finished processing 400 out of 5189 rows.
Finished processing 500 out of 5189 rows.
Finished processing 600 out of 5189 rows.
Finished processing 700 out of 5189 rows.
Finished processing 800 out of 5189 rows.
Finished processing 900 out of 5189 rows.
Finished processing 1000 out of 5189 rows.
Finished processing 1100 out of 5189 rows.
Finished processing 1200 out of 5189 rows.
Finished processing 1300 out of 5189 rows.
Finished processing 1400 out of 5189 rows.
Finished processing 1500 out of 5189 rows.
Finished processing 1600 out of 5189 rows.
Finished processing 1700 out of 5189 rows.
Finished processing 1800 out of 5189 rows.
Finished processing 1900 out of 5189 rows.
Finished processing 2000 out of 5189 rows.
Finished processing 2100 out of 5189 rows.
Finished processing 2200 out of 5189 rows.
Finished processing 230

In [6]:
unique_strings = []
for key in table:
    unique_strings.extend(table[key])
print(len(unique_strings))

14980


In [8]:
sub_corpus = []
full_corpus = ""
for i in range(0, len(unique_strings), 7000):
    max_idx = min(len(unique_strings), i + 7000)
    sub_corpus.append(unique_strings[i:max_idx])
    full_corpus += "\n".join(unique_strings[i:max_idx]) + "\n"

with open("corpus.txt", "w") as f:
    f.write(full_corpus)

with open("corpus_split.json", "w+")  as f:
    sc2 = ["\n".join(sub) for sub in sub_corpus]
    f.write(str(sc2))

In [7]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

stopwords_list = stopwords.words('english')
embeddings_model = SentenceTransformer("all-MiniLM-L6-v2")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vijayd2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  return self.fget.__get__(instance, owner)()


In [8]:
def get_phrases(corpus, length=10, split=5):
    total_words = []
    for site in corpus:
        words = [word.lower() for word in site.split()]
        # words = [word for word in words if word not in stopwords_list]
        total_words.extend(words)

    phrases = []
    
    for i in range(0, len(total_words), split):
        right_idx = min(len(total_words), i + length)
        phrases.append(" ".join(total_words[i:right_idx]))
    
    return phrases

In [9]:
print(len(get_phrases(sub_corpus[0], length=20, split=15)))

115874


In [10]:
del df

In [11]:
import heapq

def track_words(original_phrases, embedded_phrases, phrases, max_count=200):
    heap = []
    original_embedded = embeddings_model.encode(original_phrases)
    # embedded_phrases = embeddings_model.encode(phrases)
    print("Finished getting encodings")
    for pidx, embedded_phrase in enumerate(embedded_phrases):
        sim_sum = sum(util.cos_sim(original_embedded, embedded_phrase))
        heapq.heappush(heap, (sim_sum, phrases[pidx]))
        if len(heap) > max_count:
            heapq.heappop(heap)
    return heap

In [12]:
# phrases_for_extraction = {
#     "military": ["supporting our troops", "helping veterans", "first responders", "law enforcement"],
#     "anti-foreign": ["made in China", "low quality foreign imports", "sweap shop"],
#     "jobs": ["American workers", "job creation", "local jobs"],
#     "quality": ["last you a lifetime", "lifetime warranty", "american-made quality", "superior quality"],
#     "labor": ["labor condition and benefits", "sweat shops", "worker benefits", "partnerships with unions"],
#     "revival": ["make america great again", "MAGA", "american manufacturing", "reshoring manufacturing", "boosting america", "america first"]
# }

phrases_for_extraction = {
    "jobs_revised": ["job growth", "hiring from local communities", "boosting local jobs", "hire american"],
    "antiforeign_revised_2": ["non-imported", "low-quality foreign manufacturing", "low-quality outsourcing"],
    "quality_revised": ["american-made quality"],
    "labor_revised": ["labor condition and benefits", "sweat shops", "worker benefits", "partnerships with unions"]
}

In [13]:
subcorpus_embeddings = []
subcorpus_phrases = []
for sc in sub_corpus:
    test_phrases = get_phrases(sc, length=20, split=15)
    extracted_phrases = embeddings_model.encode(test_phrases)
    subcorpus_phrases.append(test_phrases)
    subcorpus_embeddings.append(extracted_phrases)

In [14]:
from openai import OpenAI

model = OpenAI(
    api_key="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"
)

def ask_llm_jobs(phrase):
    messages = [
        {"role": "system", "content": """Classify the following phrase as either: GENERAL, OFFER, or NEITHER. 
        The phrase should be classified as OFFER if it refers to a specific job offer the company is providing. 
        The phrase should be classified as GENERAL if it refers to a general attitude of supporting job growth in the United States.
        The phrase should be classified as NEITHER if it refers to neither or is miscellaneous."""},
        {"role": "user", "content": "Phrase: " + phrase},
    ]
        
    response = model.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=messages,
        max_tokens=100,
        temperature=0.1
    )

    return response.choices[0].message.content, "GENERAL"

def ask_llm_antiforeign(phrase):
    messages = [
        {"role": "system", "content": """Classify the following phrase as either: US, FOREIGN, or NEITHER. 
        The phrase should be classified as US if it refers solely to products being made in the United States. 
        The phrase should be classified as FOREIGN if it refers to goods that are being made abroad (both positively and negatively). 
        FOREIGN phrases can include references to products being manufactured in the United States, but must include a foreign component.
        The phrase should be classified as NEITHER if it doesn't refer to a location of manufacture or is miscellaneous.
        
        Some examples of FOREIGN phrases are:
        1. "our small workshop in pennsylvania. made in china mass produced in a crowded chinese factory without any love or expertise."
        2. "of the 3-4 months typically required when importing from china. small batch customization is a service that is somewhat unique"
        3. "cheap imports, we take pride in creating quality handmade leather goods using full grain leather. our handmade buffalo leather goods"
        4. "corporations continue to rely on manufactures overseas because they can produce a lower cost product by using sweatshop labor (even"
        5. "39 years. in a time when ruthlessly cheap imports flood the market, we strive to make a product that is"

        Some examples of US phrases are:
        1. "quality denim and fabrics from our mill partners in usa, italy and japan a family company based in new york"
            Explanation: This is describing their partner locations in a neutrals tone, not anti foreign. Usually anti foreign is targeting developing countries, or just "foreign" imports in general. So there is a strong discriminatory element there.
        2. "are now made back in france. brand a - z | brand z - a | price low - high"
        3. "our global presence and robust delivery capabilities allow us to efficiently meet region-specific production needs, anywhere in the world. locations" This one is actually saying our global presence is a good thing, so it's the opposite idea.
        4. "we are working diligently to produce and ship quality bb simon products around the world, as efficiently as possible. we" Similar as 3. They are say we sell globally, which is a strength, so not bashing foreign products or bashing globalization.
        5. "internationally. we have a diversified range of innovative and quality products for consumer, plumbing, healthcare and oem/contract markets. you may"
        6. "research and development, manufacturing, and assembly operations in belgium, brazil, china, hungary, india, italy, mexico, the united kingdom, and the"
        """},
        
        {"role": "user", "content": "Phrase: " + phrase},
    ]
        
    response = model.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=messages,
        max_tokens=100,
        temperature=0.1
    )

    return response.choices[0].message.content, "FOREIGN"

def ask_llm_labor_fairness(phrase):
    messages = [
        {"role": "system", "content": """Classify the following phrase as either: FAIRNESS, NOT. 
        The phrase should be classified as FAIRNESS if it describes ideas such as: "we pay a fair wage", "a decent wage to american workers", "they have good work conditions", etc.
        The phrase should be classified as NOT if it does not.
        """},
        
        {"role": "user", "content": "Phrase: " + phrase},
    ]
        
    response = model.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=messages,
        max_tokens=100,
        temperature=0.1
    )

    return response.choices[0].message.content, "FAIRNESS"

def ask_llm_quality(phrase):
    messages = [
        {"role": "system", "content": """Classify the phrase as the following: AMERICAN, NOT.
        The phrase should be classified as AMERICAN if it explicitly states the quality increase from goods being manufactured in the United States.
        The phrase should be classified as NOT if it does not.
        """},
        
        {"role": "user", "content": "Phrase: " + phrase},
    ]
        
    response = model.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=messages,
        max_tokens=100,
        temperature=0.1
    )

    return response.choices[0].message.content, "AMERICAN"

In [16]:
import json

for phrase_key in ["quality_revised"]:
    all_extracted_phrases = []

    for scidx, sc in enumerate(sub_corpus):
        extracted_phrases = track_words(phrases_for_extraction[phrase_key], subcorpus_embeddings[scidx], subcorpus_phrases[scidx])
        for p in extracted_phrases:
            phrase = p[1]
            answer, keyword = ask_llm_quality(phrase)
            print("Phrase: ", phrase, " Answer: ", answer)
            if keyword in answer:
                all_extracted_phrases.append(phrase)
        
        save_file = open(f"{phrase_key}_phrases.json", "w")
        save_file.write(json.dumps(all_extracted_phrases, indent=4))
        save_file.close()
    
    print("NEW KEYWORD: ", phrase_key)
    print(all_extracted_phrases)

Finished getting encodings
Phrase:  made in the usa: top brands reviewed global-params-dmp calphalon: superior cookware, bakeware & appliances core-pageview chat360° size & fit guide  Answer:  NOT
Phrase:  in heirloom quality men’s accessories crafted in america and designed for life. - see more at: http://www.bisonmade.com/#sthash.4zwgvcjj.dpu ____________________________________________________________________________________________________________________________________________ instagram facebook  Answer:  AMERICAN
Phrase:  prescribed temperature 100% crafted in america with highest quality american components eco-friendly designs, crafted with several recycled materials conserves water,  Answer:  AMERICAN
Phrase:  trusted by thousands of americans every day due to our high-quality steel, innovative mechanisms, and industry leading warranty. view products  Answer:  AMERICAN
Phrase:  on: findamericanmadeproducts.com americansworking.com latest tweets copyright body and sole comfort