In [1]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from openai import OpenAI
from dotenv import load_dotenv
import re

load_dotenv()
client = OpenAI(api_key=os.environ["OPENROUTER_API_KEY"], base_url="https://openrouter.ai/api/v1")

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
website_df = pd.read_csv("website_data.csv")

website_df.head()

[nltk_data] Downloading package stopwords to /Users/vijay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  website_df = pd.read_csv("website_data.csv")


Unnamed: 0.1,Unnamed: 0,std_name,company_name,company_website_link,Made_in_USA_related_info_link,notes,list_order,list_title,list_link,Title_List_Name,...,1997-07,1997-06,1997-05,1997-04,1997-03,1997-02,1997-01,1996-12,1996-11,1996-10
0,0,11industries,11 Industries,https://11industries.com/,,"Wallets, Dopp Kits, Bags | ""All our products ...",,,,,...,,,,,,,,,,
1,1,11wellsspirits,11 Wells Spirits,https://www.11wells.com,,,194.0,American Rum Producers,https://www.robsrum.com/americanrum/,,...,,,,,,,,,,
2,2,14throse,14th Rose,https://14throse.com/,,"Women?s Purses, Totes | ""14th Rose Purses are ...",,,,,...,,,,,,,,,,
3,3,1919cookware,1919 Cookware,https://www.regalware.com,,,373.0,,https://usamadeproducts.biz/appliances-kitchen...,USA Made Cookware List | 29 Brands & Manufactu...,...,,,,,,,"\nWelcome to Regal Ware, Inc.\nA global leader...","\nWelcome to Regal Ware, Inc.\nA global leader...",,
4,4,20degeesbelow,20 Degees below,https://20below.net,,,352.0,,https://americansworking.com/clothing.html,Clothing - americans-working,...,,,,,,,,,,


In [None]:

def propagate_str_value_forward(df, min_length=1):
    df = df.copy()
    # Replace empty strings with NaN
    df.replace('', np.nan, inplace=True)  
    # Replace strings with length less than min_length with NaN
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].apply(lambda x: np.nan if isinstance(x, str) and len(x) < min_length else x)
    # Forward fill NaN values
    df = df.T.ffill(axis=0).T  
    # Fill remaining NaN with empty strings
    df.fillna('', inplace=True)  
    return df

def remove_stopwords(text):
    if isinstance(text, str):
        return ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

selected_cols = reversed([col for col in website_df.columns if col.startswith("1") or col.startswith("2")])
website_df = website_df[selected_cols]
website_df = propagate_str_value_forward(website_df)

for col in tqdm(website_df.select_dtypes(include=['object']).columns):
    website_df[col] = website_df[col].apply(remove_stopwords)

In [4]:
def classify_text_with_openai(text):
    prompt = f"Classify the text into the likelist SIC code for the business. If you're unsure, match it to the closest SIC code from the segment of text you see. \n\nText: {text[:1000]}\n\n SIC Code:"
    
    try:
        response = client.chat.completions.create(
            model="google/gemma-2-9b-it",
            messages=[
                {"role": "system", "content": "You are a SIC classification assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=30,
            temperature=0.3
        )
        
        classification = response.choices[0].message.content.strip()

        sic_code_match = re.search(r'\b\d{4}\b', classification)
        if sic_code_match:
            classification = sic_code_match.group(0)
        else:
            # print("Couldn't match SIC code in: ", classification, text[:1000], len(text))
            digits = ''.join(re.findall(r'\d+', classification))
            return digits
            # if len(digits) >= 4:
            #     classification = digits[:4]
            # else:
            #     classification = "0000"

        return classification
    except Exception as e:
        print(f"Error processing text: {e}")
        return "0000"  # Default for errors

In [5]:
website_df.iloc[:, -1]

0       Gifts Men | 11 Industries | Gentleman's Genera...
1       Distillery | 11 Wells Distillery | Saint Paul ...
2       14th Rose - Handbags, Purses, Woman's Bags Fre...
3       Home - Regal Ware Work Core Values Story Leade...
4       Custom Swim Parkas Wholesale Fleece Jackets | ...
                              ...                        
5184                                                   ��
5185    Home - Xuron Corporation, original inventor Mi...
5186    Yellow 108 – Yellow 108 | Sustainable Headwear...
5187    Cast Touring - Standard Alpine Touring Freerid...
5188    handmade ultra light backpacking gear – YAR.ge...
Name: 2023-10, Length: 5189, dtype: object

In [6]:
import concurrent.futures
import os
import json
from tqdm import tqdm
import threading
import time

# Function to process a single row
def process_row(row_data):
    i, row = row_data
    text = row[-1]
    classification = classify_text_with_openai(text)
    return i, classification

# Load existing classifications
website_classifications = []
if os.path.exists("website_classifications.json"):
    with open("website_classifications.json", "r") as f:
        website_classifications = json.load(f)

# Initialize or prepare for processing
if not website_classifications or len(website_classifications) != len(website_df):
    # If we need to start fresh or sizes don't match
    if len(website_classifications) != len(website_df):
        website_classifications = [None] * len(website_df)
    
    # Determine which rows need processing
    tasks = []
    for i, row in enumerate(website_df.itertuples(index=False)):
        if i >= len(website_classifications) or website_classifications[i] is None:
            tasks.append((i, row))
    
    # Set up saving mechanism
    save_lock = threading.Lock()
    last_save_time = time.time()
    
    # Number of concurrent workers - adjust based on API rate limits
    max_workers = 5
    
    # Process in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        futures = {executor.submit(process_row, task): task[0] for task in tasks}
        
        # Process results as they complete
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(tasks)):
            idx = futures[future]
            try:
                i, classification = future.result()
                website_classifications[i] = classification
                
                # Save periodically (every 20 seconds)
                current_time = time.time()
                if current_time - last_save_time > 20:
                    with save_lock:
                        with open("website_classifications.json", "w") as f:
                            json.dump(website_classifications, f)
                    last_save_time = current_time
                    
            except Exception as exc:
                print(f'Error processing item {idx}: {exc}')
    
    # Final save
    with open("website_classifications.json", "w") as f:
        json.dump(website_classifications, f)

website_df["category"] = website_classifications
website_df.to_csv("website_data_with_classifications.csv", index=False)

 12%|█▏        | 633/5189 [02:35<23:19,  3.25it/s]

Error processing text: list index out of range


 17%|█▋        | 896/5189 [03:42<23:16,  3.07it/s]

Error processing text: list index out of range


 17%|█▋        | 906/5189 [03:46<32:34,  2.19it/s]

Error processing text: list index out of range


 18%|█▊        | 914/5189 [03:50<29:41,  2.40it/s]

Error processing text: list index out of range


 18%|█▊        | 915/5189 [03:51<38:47,  1.84it/s]

Error processing text: list index out of range


 18%|█▊        | 925/5189 [03:56<30:21,  2.34it/s]

Error processing text: list index out of range


 18%|█▊        | 932/5189 [03:58<32:10,  2.21it/s]

Error processing text: list index out of range


 18%|█▊        | 936/5189 [04:00<31:48,  2.23it/s]

Error processing text: list index out of range


 18%|█▊        | 937/5189 [04:01<37:45,  1.88it/s]

Error processing text: list index out of range


 18%|█▊        | 939/5189 [04:01<28:59,  2.44it/s]

Error processing text: list index out of range


 18%|█▊        | 940/5189 [04:02<36:08,  1.96it/s]

Error processing text: list index out of range


 18%|█▊        | 944/5189 [04:04<31:32,  2.24it/s]

Error processing text: list index out of range


 18%|█▊        | 950/5189 [04:09<59:24,  1.19it/s]

Error processing text: list index out of range


 18%|█▊        | 952/5189 [04:12<1:25:22,  1.21s/it]

Error processing text: list index out of range


 18%|█▊        | 953/5189 [04:13<1:25:55,  1.22s/it]

Error processing text: list index out of range


 18%|█▊        | 958/5189 [04:15<42:00,  1.68it/s]  

Error processing text: list index out of range


 19%|█▊        | 969/5189 [04:19<28:26,  2.47it/s]

Error processing text: list index out of range


 19%|█▉        | 984/5189 [04:25<23:05,  3.04it/s]

Error processing text: list index out of range


 26%|██▌       | 1337/5189 [05:58<10:18,  6.23it/s]

Error processing text: list index out of range


 32%|███▏      | 1647/5189 [07:13<21:40,  2.72it/s]

Error processing text: list index out of range


 47%|████▋     | 2418/5189 [10:21<10:39,  4.33it/s]

Error processing text: list index out of range


 49%|████▊     | 2523/5189 [10:47<12:15,  3.62it/s]

Error processing text: list index out of range


 50%|█████     | 2598/5189 [11:03<07:39,  5.64it/s]

Error processing text: list index out of range


 52%|█████▏    | 2721/5189 [11:34<10:28,  3.93it/s]

Error processing text: list index out of range


 52%|█████▏    | 2724/5189 [11:34<06:51,  6.00it/s]

Error processing text: list index out of range


 53%|█████▎    | 2752/5189 [11:41<10:12,  3.98it/s]

Error processing text: list index out of range


 53%|█████▎    | 2762/5189 [11:44<09:34,  4.23it/s]

Error processing text: list index out of range
Error processing text: list index out of range


 54%|█████▍    | 2792/5189 [11:51<09:18,  4.29it/s]

Error processing text: list index out of range


 54%|█████▍    | 2804/5189 [11:55<12:17,  3.23it/s]

Error processing text: list index out of range
Error processing text: list index out of range


 55%|█████▍    | 2834/5189 [12:01<08:57,  4.38it/s]

Error processing text: list index out of range


 55%|█████▍    | 2837/5189 [12:02<10:43,  3.66it/s]

Error processing text: list index out of range


 55%|█████▌    | 2854/5189 [12:06<05:35,  6.97it/s]

Error processing text: list index out of range


 55%|█████▌    | 2869/5189 [12:10<08:10,  4.73it/s]

Error processing text: list index out of range
Error processing text: list index out of range


 58%|█████▊    | 3022/5189 [12:46<08:06,  4.46it/s]

Error processing text: list index out of range


 60%|██████    | 3114/5189 [13:08<06:11,  5.58it/s]

Error processing text: list index out of range
Error processing text: list index out of range


 60%|██████    | 3135/5189 [13:13<06:34,  5.21it/s]

Error processing text: list index out of range


 61%|██████    | 3144/5189 [13:15<07:20,  4.64it/s]

Error processing text: list index out of range


 62%|██████▏   | 3214/5189 [13:30<07:53,  4.17it/s]

Error processing text: list index out of range


 64%|██████▎   | 3304/5189 [13:53<06:10,  5.09it/s]

Error processing text: list index out of range


 66%|██████▌   | 3436/5189 [14:25<08:36,  3.39it/s]

Error processing text: list index out of range


 67%|██████▋   | 3457/5189 [14:30<07:21,  3.93it/s]

Error processing text: list index out of range


 67%|██████▋   | 3491/5189 [14:38<05:45,  4.92it/s]

Error processing text: list index out of range


 69%|██████▉   | 3586/5189 [15:03<07:27,  3.59it/s]

Error processing text: list index out of range


 69%|██████▉   | 3598/5189 [15:06<05:57,  4.46it/s]

Error processing text: list index out of range


 70%|██████▉   | 3614/5189 [15:09<05:04,  5.17it/s]

Error processing text: list index out of range


 71%|███████▏  | 3708/5189 [15:31<04:46,  5.18it/s]

Error processing text: list index out of range


 72%|███████▏  | 3734/5189 [15:37<05:32,  4.37it/s]

Error processing text: list index out of range


 72%|███████▏  | 3744/5189 [15:40<06:12,  3.88it/s]

Error processing text: list index out of range


 73%|███████▎  | 3780/5189 [15:49<05:07,  4.58it/s]

Error processing text: list index out of range


 73%|███████▎  | 3797/5189 [15:54<04:12,  5.51it/s]

Error processing text: list index out of range
Error processing text: list index out of range


 73%|███████▎  | 3802/5189 [15:55<03:56,  5.87it/s]

Error processing text: list index out of range


 76%|███████▌  | 3922/5189 [16:21<04:05,  5.16it/s]

Error processing text: list index out of range


 76%|███████▌  | 3931/5189 [16:24<04:59,  4.21it/s]

Error processing text: list index out of range


 78%|███████▊  | 4059/5189 [16:56<04:39,  4.04it/s]

Error processing text: list index out of range


 78%|███████▊  | 4061/5189 [16:57<04:29,  4.19it/s]

Error processing text: list index out of range


 80%|███████▉  | 4150/5189 [17:19<03:42,  4.68it/s]

Error processing text: list index out of range


 80%|████████  | 4163/5189 [17:22<03:20,  5.11it/s]

Error processing text: list index out of range


 82%|████████▏ | 4240/5189 [17:42<03:12,  4.92it/s]

Error processing text: list index out of range


 82%|████████▏ | 4248/5189 [17:44<03:51,  4.07it/s]

Error processing text: list index out of range


 84%|████████▎ | 4334/5189 [18:06<03:26,  4.15it/s]

Error processing text: list index out of range


 84%|████████▎ | 4336/5189 [18:06<02:53,  4.92it/s]

Error processing text: list index out of range


 84%|████████▍ | 4374/5189 [18:16<03:28,  3.91it/s]

Error processing text: list index out of range


 85%|████████▍ | 4390/5189 [18:20<03:29,  3.82it/s]

Error processing text: list index out of range


 85%|████████▌ | 4427/5189 [18:29<02:16,  5.57it/s]

Error processing text: list index out of range


 86%|████████▌ | 4448/5189 [18:35<02:30,  4.92it/s]

Error processing text: list index out of range


 86%|████████▌ | 4449/5189 [18:35<02:48,  4.39it/s]

Error processing text: list index out of range


 86%|████████▋ | 4484/5189 [18:44<03:42,  3.16it/s]

Error processing text: list index out of range


 87%|████████▋ | 4497/5189 [18:47<02:25,  4.76it/s]

Error processing text: list index out of range


 88%|████████▊ | 4569/5189 [19:06<03:02,  3.39it/s]

Error processing text: list index out of range


 88%|████████▊ | 4584/5189 [19:09<02:27,  4.11it/s]

Error processing text: list index out of range


 88%|████████▊ | 4591/5189 [19:11<02:07,  4.68it/s]

Error processing text: list index out of range


 88%|████████▊ | 4592/5189 [19:12<03:05,  3.21it/s]

Error processing text: list index out of range


 89%|████████▊ | 4600/5189 [19:13<02:03,  4.78it/s]

Error processing text: list index out of range


 89%|████████▉ | 4622/5189 [19:19<02:41,  3.52it/s]

Error processing text: list index out of range


 89%|████████▉ | 4626/5189 [19:20<02:00,  4.67it/s]

Error processing text: list index out of range


 90%|████████▉ | 4645/5189 [19:24<02:03,  4.39it/s]

Error processing text: list index out of range


 90%|████████▉ | 4650/5189 [19:26<02:08,  4.18it/s]

Error processing text: list index out of range


 90%|████████▉ | 4660/5189 [19:28<02:35,  3.40it/s]

Error processing text: list index out of range


 91%|█████████ | 4724/5189 [19:44<01:51,  4.16it/s]

Error processing text: list index out of range


 91%|█████████▏| 4740/5189 [19:48<01:24,  5.30it/s]

Error processing text: list index out of range


 91%|█████████▏| 4742/5189 [19:49<01:25,  5.20it/s]

Error processing text: list index out of range


 91%|█████████▏| 4746/5189 [19:50<01:42,  4.31it/s]

Error processing text: list index out of range


 92%|█████████▏| 4749/5189 [19:51<01:41,  4.32it/s]

Error processing text: list index out of range


 92%|█████████▏| 4755/5189 [19:52<01:30,  4.79it/s]

Error processing text: list index out of range


 93%|█████████▎| 4805/5189 [20:05<01:25,  4.48it/s]

Error processing text: list index out of range


 93%|█████████▎| 4808/5189 [20:06<01:41,  3.77it/s]

Error processing text: list index out of range


 93%|█████████▎| 4825/5189 [20:10<01:35,  3.82it/s]

Error processing text: list index out of range


 94%|█████████▎| 4863/5189 [20:19<01:14,  4.38it/s]

Error processing text: list index out of range


 94%|█████████▍| 4866/5189 [20:20<01:26,  3.74it/s]

Error processing text: list index out of range


 94%|█████████▍| 4876/5189 [20:23<01:18,  3.96it/s]

Error processing text: list index out of range


 94%|█████████▍| 4901/5189 [20:29<00:41,  6.90it/s]

Error processing text: list index out of range


 95%|█████████▍| 4906/5189 [20:30<00:56,  5.02it/s]

Error processing text: list index out of range


 95%|█████████▍| 4914/5189 [20:32<00:55,  4.92it/s]

Error processing text: list index out of range


 95%|█████████▍| 4919/5189 [20:33<01:07,  4.00it/s]

Error processing text: list index out of range


 95%|█████████▌| 4949/5189 [20:39<00:50,  4.75it/s]

Error processing text: list index out of range


 95%|█████████▌| 4952/5189 [20:40<01:04,  3.67it/s]

Error processing text: list index out of range


 95%|█████████▌| 4953/5189 [20:41<01:22,  2.86it/s]

Error processing text: list index out of range


 96%|█████████▌| 4976/5189 [20:45<00:36,  5.86it/s]

Error processing text: list index out of range


 96%|█████████▌| 4994/5189 [20:50<00:45,  4.32it/s]

Error processing text: list index out of range


 96%|█████████▋| 5001/5189 [20:52<00:37,  4.95it/s]

Error processing text: list index out of range


 97%|█████████▋| 5024/5189 [20:58<00:39,  4.13it/s]

Error processing text: list index out of range
Error processing text: list index out of range


 98%|█████████▊| 5082/5189 [21:11<00:17,  6.28it/s]

Error processing text: list index out of range


 98%|█████████▊| 5097/5189 [21:15<00:19,  4.68it/s]

Error processing text: list index out of range


100%|█████████▉| 5164/5189 [21:30<00:04,  5.89it/s]

Error processing text: list index out of range


100%|██████████| 5189/5189 [21:36<00:00,  4.00it/s]
  website_df["category"] = website_classifications
