In [116]:
#imports
import pandas as pd
import nltk
import numpy as np
import string
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [200]:
companies_path = ("/content/drive/MyDrive/veridion/ml_insurance_challenge.csv")
taxonomy_path = ("/content/drive/MyDrive/veridion/insurance_taxonomy.csv")

companies = pd.read_csv(companies_path)
taxonomy = pd.read_csv(taxonomy_path)
len(companies)

9494

In [201]:
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    tokens = list(set(tokens))
    return ' '.join(tokens)

def tokenize(text):
    return clean_text(text).split()

In [5]:
!pip install sentence_transformers



In [172]:
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

In [202]:
def combine_fields_without_desc(business_tags, sector, category, niche):
    """
    Combine the provided fields into a single string.
    Adjust the format or weights as needed.
    """
    if isinstance(business_tags, list):
        business_tags = " ".join(business_tags)
    combined = f"{business_tags} {sector} {category} {niche}"
    return combined

In [203]:
flagged_indexes = []
threshold = 0.45
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cuda')

for idx, row in companies.iterrows():
    description = str(row['description'])
    business_tags = row['business_tags']
    sector = str(row['sector'])
    category = str(row['category'])
    niche = str(row['niche'])

    combined_text = combine_fields_without_desc(business_tags, sector, category, niche)

    cleaned_description = clean_text(description)
    cleaned_combined = clean_text(combined_text)

    with torch.no_grad():
        desc_embedding = model.encode(cleaned_description, convert_to_tensor=True)
        combined_embedding = model.encode(cleaned_combined, convert_to_tensor=True)

    cos_sim = F.cosine_similarity(desc_embedding, combined_embedding, dim=0).item()

    if cos_sim < threshold:
        flagged_indexes.append(idx)

companies_filtered = companies.drop(flagged_indexes)
remaining_idx = companies.index.difference(companies_filtered.index)
companies_extra = companies.loc[remaining_idx].copy()

print("Number of entries removed as noisy:", len(flagged_indexes))
print("Remaining companies:", len(companies_filtered))

Number of entries removed as noisy: 2109
Remaining companies: 7385


In [125]:
flagged_indexes

[1,
 7,
 14,
 17,
 22,
 26,
 28,
 35,
 38,
 49,
 54,
 58,
 59,
 61,
 65,
 74,
 80,
 88,
 91,
 92,
 93,
 94,
 99,
 104,
 106,
 114,
 119,
 121,
 122,
 126,
 127,
 134,
 159,
 163,
 173,
 174,
 175,
 177,
 178,
 179,
 180,
 181,
 182,
 184,
 196,
 201,
 205,
 209,
 212,
 213,
 215,
 218,
 224,
 234,
 235,
 240,
 243,
 246,
 249,
 250,
 256,
 257,
 260,
 265,
 269,
 273,
 279,
 282,
 287,
 291,
 293,
 295,
 297,
 302,
 303,
 306,
 307,
 308,
 318,
 326,
 334,
 342,
 343,
 346,
 347,
 355,
 362,
 365,
 369,
 372,
 376,
 377,
 384,
 390,
 391,
 403,
 404,
 407,
 410,
 412,
 423,
 426,
 427,
 438,
 444,
 447,
 450,
 452,
 455,
 467,
 470,
 471,
 472,
 479,
 485,
 489,
 499,
 501,
 505,
 506,
 509,
 519,
 521,
 522,
 524,
 550,
 565,
 567,
 570,
 571,
 572,
 574,
 575,
 581,
 582,
 584,
 585,
 587,
 590,
 599,
 608,
 611,
 613,
 628,
 639,
 640,
 641,
 660,
 661,
 664,
 668,
 670,
 678,
 685,
 690,
 691,
 703,
 704,
 705,
 706,
 711,
 716,
 719,
 720,
 723,
 727,
 732,
 738,
 740,
 754,
 757,

In [204]:
companies_filtered.to_csv('filtered_companies.csv', index=False)

In [205]:
companies_filtered.head()

Unnamed: 0,description,business_tags,sector,category,niche
0,Welchcivils is a civil engineering and constru...,"['Construction Services', 'Multi-utilities', '...",Services,Civil Engineering Services,Other Heavy and Civil Engineering Construction
2,Loidholdhof Integrative Hofgemeinschaft is a c...,"['Living Forms', 'Farm Cafe', 'Fresh Coffee', ...",Manufacturing,Farms & Agriculture Production,All Other Miscellaneous Crop Farming
3,PATAGONIA Chapa Y Pintura is an auto body shop...,"['Automotive Body Repair Services', 'Interior ...",Services,Auto Body Shops,"Automotive Body, Paint, and Interior Repair an..."
4,Stanica WODNA PTTK Swornegacie is a cultural e...,"['Cultural Activities', 'Accommodation Service...",Services,Boat Tours & Cruises,"Scenic and Sightseeing Transportation, Water"
5,BIQ Benefícios is a Brazilian company that spe...,"['Healthy Food Options', 'Accredited Establish...",Services,Payment Processing & Point of Sale,"Financial Transactions Processing, Reserve, an..."


In [206]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

In [207]:
def combine_fields(description, business_tags, sector, category, niche):
    """
    Combine the provided fields into a single string.
    Adjust the format or weights as needed.
    """
    if isinstance(business_tags, list):
        business_tags = " ".join(business_tags)
    combined = f"{description} {business_tags} {sector} {category} {niche}"
    return combined

In [208]:
def clean_label(txt):
    to_remove = ["services", "management", "installation", "construction"]

    txt = " ".join([word for word in txt.split() if word not in to_remove])
    return txt

In [209]:
taxonomy_labels = taxonomy['label'].tolist()
cleaned_taxonomy_labels = [clean_label(label) for label in taxonomy_labels]

with torch.no_grad():
    label_embeddings = model.encode(cleaned_taxonomy_labels, convert_to_tensor=True)
    label_embeddings = F.normalize(label_embeddings, p=2, dim=1)

def classify_texts(texts):
    with torch.no_grad():
        text_embeddings = model.encode(texts, convert_to_tensor=True)
        text_embeddings = F.normalize(text_embeddings, p=2, dim=1)

        similarity = torch.matmul(text_embeddings, label_embeddings.T)

        probabilities = F.softmax(similarity, dim=1)

    return probabilities.cpu().numpy()


#texts = companies_filtered['description'].tolist()
sector = companies_filtered['sector'].tolist()
category = companies_filtered['category'].tolist()
niche = companies_filtered['niche'].tolist()
all_texts = [combine_fields_without_desc(business_tags, sector, category, niche) for business_tags, sector, category, niche in zip(companies_filtered['business_tags'], companies_filtered['sector'], companies_filtered['category'], companies_filtered['niche'])]
cleaned_texts = [clean_text(text) for text in all_texts]

results = classify_texts(cleaned_texts)
#for res in results:
#  tags = [label for label, score in res]
#  predicted_tags.append(tags)

#companies_filtered['predicted_tags'] = predicted_tags

In [210]:
len(results)

7385

In [211]:
len(taxonomy)

220

In [212]:
companies_filtered = companies_filtered.reset_index(drop=True)

companies_filtered['labels'] = ""

for i in range(len(companies_filtered)):
    if i < len(results):
        top3_indices = np.argsort(results[i])[-3:][::-1]
        top3_labels = [taxonomy_labels[j] for j in top3_indices]
        companies_filtered.at[i, 'labels'] = top3_labels
    else:
        companies_filtered.at[i, 'labels'] = []

companies_filtered.head()

Unnamed: 0,description,business_tags,sector,category,niche,labels
0,Welchcivils is a civil engineering and constru...,"['Construction Services', 'Multi-utilities', '...",Services,Civil Engineering Services,Other Heavy and Civil Engineering Construction,[Commercial Communication Equipment Installati...
1,Loidholdhof Integrative Hofgemeinschaft is a c...,"['Living Forms', 'Farm Cafe', 'Fresh Coffee', ...",Manufacturing,Farms & Agriculture Production,All Other Miscellaneous Crop Farming,"[Bakery Production Services, Gardening Service..."
2,PATAGONIA Chapa Y Pintura is an auto body shop...,"['Automotive Body Repair Services', 'Interior ...",Services,Auto Body Shops,"Automotive Body, Paint, and Interior Repair an...","[Interior Design Services, Painting Services, ..."
3,Stanica WODNA PTTK Swornegacie is a cultural e...,"['Cultural Activities', 'Accommodation Service...",Services,Boat Tours & Cruises,"Scenic and Sightseeing Transportation, Water","[Travel Services, Catering Services, Commercia..."
4,BIQ Benefícios is a Brazilian company that spe...,"['Healthy Food Options', 'Accredited Establish...",Services,Payment Processing & Point of Sale,"Financial Transactions Processing, Reserve, an...","[Food Processing Services, Health Promotion Se..."


In [213]:
companies_filtered.to_csv('filtered_companies_tagged.csv', index=False)

In [218]:
companies_filtered = companies_filtered.copy()
companies_filtered['combined_text'] = companies_filtered.apply(lambda row: combine_fields(
    row['description'], row['business_tags'], row['sector'], row['category'], row['niche']
), axis=1)
companies_filtered['combined_text'] = companies_filtered['combined_text'].apply(clean_text)


In [219]:
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
import random

In [220]:
def tokenize(text):
    return clean_text(text).split()

def build_vocab(texts, min_freq=1):
    counter = Counter()
    for text in texts:
        tokens = tokenize(text)
        counter.update(tokens)
    vocab = {token: idx+2 for idx, (token, count) in enumerate(counter.items()) if count >= min_freq}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

vocab = build_vocab(companies_filtered['combined_text'].tolist(), min_freq=1)

def text_to_indices(text, vocab):
    tokens = tokenize(text)
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]


In [221]:
def synonym_replacement(text, n):
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stopwords.words('english')]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    sentence = ' '.join(new_words)
    return sentence


def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym)
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

def random_deletion(text, p):
    words = text.split()
    if len(words) == 1:
        return text
    remaining_words = [word for word in words if random.uniform(0, 1) > p]
    if len(remaining_words) == 0:
        remaining_words.append(random.choice(words))
    return ' '.join(remaining_words)

In [217]:
empty_rows = companies_filtered[companies_filtered['labels'] == '']
print(empty_rows)

Empty DataFrame
Columns: [description, business_tags, sector, category, niche, labels]
Index: []


In [248]:
unique_labels = list(set([label for sublist in companies_filtered['labels'].tolist() for label in sublist]))
label2idx = {label: idx for idx, label in enumerate(unique_labels)}

In [249]:
train_df, temp_df = train_test_split(companies_filtered, test_size=20, random_state=42)
val_df, test_df = train_test_split(companies_filtered, test_size=0.5, random_state=42)

In [252]:
class CompanyDataset(Dataset):
    def __init__(self, texts, labels, vocab, label2idx):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.label2idx = label2idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        indices = text_to_indices(text, self.vocab)
        if random.random() < 0.5:
            text = synonym_replacement(text, n=1)
        label = self.label2idx[self.labels[idx][0]]
        return torch.tensor(indices, dtype=torch.long), torch.tensor(label, dtype=torch.long)

def collate_fn(batch):
    texts, labels = zip(*batch)
    lengths = [len(text) for text in texts]
    max_len = max(lengths)
    # Pad sequences to the same length
    padded_texts = [torch.cat([text, torch.tensor([vocab['<PAD>']] * (max_len - len(text)), dtype=torch.long)])
                    for text in texts]
    padded_texts = torch.stack(padded_texts)
    labels = torch.stack(labels)
    return padded_texts, labels, lengths

dataset = CompanyDataset(
    train_df['combined_text'].tolist(),
    train_df['labels'].tolist(),
    vocab,
    label2idx
)
test_dataset = CompanyDataset(
       test_df['combined_text'].tolist(),
       test_df['labels'].tolist(),
       vocab,
       label2idx
   )
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [253]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab['<PAD>'])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        hidden = hidden[-1]
        dropped = self.dropout(hidden)
        output = self.fc(dropped)
        return output

vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 128
output_dim = len(unique_labels)
num_layers = 1
dropout = 0.5

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout)


In [254]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [255]:
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
num_epochs = 40

for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()
    for texts, labels, lengths in dataloader:
        texts = texts.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_loss:.4f}")
    model.eval()
    test_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for texts, labels, lengths in test_dataloader:
            texts = texts.to(device)
            labels = labels.to(device)
            outputs = model(texts, lengths)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    avg_test_loss = test_loss / len(test_dataloader)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    test_acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}/{num_epochs} - Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

print("Training complete.")

torch.save(model.state_dict(), 'lstm_classifier.pt')


Epoch 1/40 - Train Loss: 4.9581
Epoch 1/40 - Test Loss: 4.6257, Test Accuracy: 0.0829
Epoch 2/40 - Train Loss: 4.5351
Epoch 2/40 - Test Loss: 4.2655, Test Accuracy: 0.1286
Epoch 3/40 - Train Loss: 4.2194
Epoch 3/40 - Test Loss: 3.8832, Test Accuracy: 0.1671
Epoch 4/40 - Train Loss: 3.9475
Epoch 4/40 - Test Loss: 3.5610, Test Accuracy: 0.2321
Epoch 5/40 - Train Loss: 3.6736
Epoch 5/40 - Test Loss: 3.2528, Test Accuracy: 0.2564
Epoch 6/40 - Train Loss: 3.3833
Epoch 6/40 - Test Loss: 3.0387, Test Accuracy: 0.2981
Epoch 7/40 - Train Loss: 3.1353
Epoch 7/40 - Test Loss: 2.7097, Test Accuracy: 0.3753
Epoch 8/40 - Train Loss: 2.8830
Epoch 8/40 - Test Loss: 2.4153, Test Accuracy: 0.4311
Epoch 9/40 - Train Loss: 2.6230
Epoch 9/40 - Test Loss: 2.1858, Test Accuracy: 0.4649
Epoch 10/40 - Train Loss: 2.4041
Epoch 10/40 - Test Loss: 1.9360, Test Accuracy: 0.5380
Epoch 11/40 - Train Loss: 2.1559
Epoch 11/40 - Test Loss: 1.7006, Test Accuracy: 0.5846
Epoch 12/40 - Train Loss: 1.9280
Epoch 12/40 - Tes

In [262]:
val_dataset = CompanyDataset(
    val_df['combined_text'].tolist(),
    val_df['labels'].tolist(),
    vocab,
    label2idx
)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
id2label = {idx: label for label, idx in label2idx.items()}

In [265]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for texts, labels, lengths in dataloader:
            texts = texts.to(device)
            labels = labels.to(device)
            outputs = model(texts, lengths)
            preds = torch.argmax(outputs, dim=1)
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    acc = accuracy_score(all_labels, all_preds)

    unique_test_labels = np.union1d(all_labels, all_preds)
    target_names = [id2label[i] for i in unique_test_labels]

    report = classification_report(
       all_labels,
       all_preds,
       labels=unique_test_labels,
       target_names=target_names,
       zero_division=0
    )

    conf_matrix = confusion_matrix(all_labels, all_preds)

    print("Accuracy: {:.4f}".format(acc))
    print("\nClassification Report:\n", report)
    print("Confusion Matrix:\n", conf_matrix)

evaluate_model(model, val_dataloader)

Accuracy: 0.9962

Classification Report:
                                                   precision    recall  f1-score   support

                         Furniture Manufacturing       1.00      1.00      1.00        67
                   Residential Plumbing Services       1.00      1.00      1.00         5
                       Grain Processing Services       1.00      1.00      1.00         1
                   Business Development Services       1.00      1.00      1.00        15
                            Food Safety Services       1.00      1.00      1.00         9
                     Alarm Installation Services       1.00      1.00      1.00        16
                      Coffee Processing Services       1.00      1.00      1.00        13
           Grain Handling Machinery Installation       1.00      1.00      1.00         4
                           Apparel Manufacturing       1.00      1.00      1.00        53
                         Ink Production Services       1.

In [268]:
companies_filtered.head()

Unnamed: 0,description,business_tags,sector,category,niche,labels,combined_text,predicted_label_index,confidence,predicted_label
0,Welchcivils is a civil engineering and constru...,"['Construction Services', 'Multi-utilities', '...",Services,Civil Engineering Services,Other Heavy and Civil Engineering Construction,[Commercial Communication Equipment Installati...,usage maximize capable fibre residential optic...,39,1.0,Commercial Communication Equipment Installation
1,Loidholdhof Integrative Hofgemeinschaft is a c...,"['Living Forms', 'Farm Cafe', 'Fresh Coffee', ...",Manufacturing,Farms & Agriculture Production,All Other Miscellaneous Crop Farming,"[Bakery Production Services, Gardening Service...",communityoriented basis focus agriculture home...,94,1.0,Bakery Production Services
2,PATAGONIA Chapa Y Pintura is an auto body shop...,"['Automotive Body Repair Services', 'Interior ...",Services,Auto Body Shops,"Automotive Body, Paint, and Interior Repair an...","[Interior Design Services, Painting Services, ...",body interior located argentina pintura comodo...,201,1.0,Interior Design Services
3,Stanica WODNA PTTK Swornegacie is a cultural e...,"['Cultural Activities', 'Accommodation Service...",Services,Boat Tours & Cruises,"Scenic and Sightseeing Transportation, Water","[Travel Services, Catering Services, Commercia...",rentals destination small levels transportatio...,150,1.0,Travel Services
4,BIQ Benefícios is a Brazilian company that spe...,"['Healthy Food Options', 'Accredited Establish...",Services,Payment Processing & Point of Sale,"Financial Transactions Processing, Reserve, an...","[Food Processing Services, Health Promotion Se...",lasts balance end generating generation financ...,167,1.0,Food Processing Services


In [280]:
def fill_predictions(row):
    if isinstance(row['labels'], list) and len(row['labels']) > 0:
        true_label = row['labels'][0]
        return pd.Series({
            'confidence': 1.0,
        })
    return pd.Series({
        'confidence': np.nan,
    })

companies_filtered['confidence'] = companies_filtered.apply(fill_predictions, axis=1)
companies_extra['combined_text'] = companies_extra.apply(
    lambda row: combine_fields(row['description'], row['business_tags'], row['sector'], row['category'], row['niche']),
    axis=1
)
companies_extra['combined_text'] = companies_extra['combined_text'].apply(clean_text)

pseudo_labels = []
confidence_scores = []
model.eval()
for idx, row in companies_extra.iterrows():
    text = row['combined_text']
    indices = text_to_indices(text, vocab)
    input_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)
    lengths = [len(indices)]
    with torch.no_grad():
        output = model(input_tensor, lengths)
        probs = F.softmax(output, dim=1)
        max_prob, pred_class = torch.max(probs, dim=1)
    pseudo_labels.append(pred_class.item())
    confidence_scores.append(max_prob.item())

companies_extra['labels'] = pseudo_labels
companies_extra['confidence'] = confidence_scores

combined_df = pd.concat([companies_filtered, companies_extra], ignore_index=True)

print("Combined DataFrame shape:", combined_df.shape)
combined_df.head()


Combined DataFrame shape: (9494, 8)


Unnamed: 0,description,business_tags,sector,category,niche,labels,confidence,combined_text
0,Welchcivils is a civil engineering and constru...,"['Construction Services', 'Multi-utilities', '...",Services,Civil Engineering Services,Other Heavy and Civil Engineering Construction,[Commercial Communication Equipment Installati...,1.0,
1,Loidholdhof Integrative Hofgemeinschaft is a c...,"['Living Forms', 'Farm Cafe', 'Fresh Coffee', ...",Manufacturing,Farms & Agriculture Production,All Other Miscellaneous Crop Farming,"[Bakery Production Services, Gardening Service...",1.0,
2,PATAGONIA Chapa Y Pintura is an auto body shop...,"['Automotive Body Repair Services', 'Interior ...",Services,Auto Body Shops,"Automotive Body, Paint, and Interior Repair an...","[Interior Design Services, Painting Services, ...",1.0,
3,Stanica WODNA PTTK Swornegacie is a cultural e...,"['Cultural Activities', 'Accommodation Service...",Services,Boat Tours & Cruises,"Scenic and Sightseeing Transportation, Water","[Travel Services, Catering Services, Commercia...",1.0,
4,BIQ Benefícios is a Brazilian company that spe...,"['Healthy Food Options', 'Accredited Establish...",Services,Payment Processing & Point of Sale,"Financial Transactions Processing, Reserve, an...","[Food Processing Services, Health Promotion Se...",1.0,


In [281]:
combined_df = combined_df.drop(columns=['combined_text'])
combined_df.head()

Unnamed: 0,description,business_tags,sector,category,niche,labels,confidence
0,Welchcivils is a civil engineering and constru...,"['Construction Services', 'Multi-utilities', '...",Services,Civil Engineering Services,Other Heavy and Civil Engineering Construction,[Commercial Communication Equipment Installati...,1.0
1,Loidholdhof Integrative Hofgemeinschaft is a c...,"['Living Forms', 'Farm Cafe', 'Fresh Coffee', ...",Manufacturing,Farms & Agriculture Production,All Other Miscellaneous Crop Farming,"[Bakery Production Services, Gardening Service...",1.0
2,PATAGONIA Chapa Y Pintura is an auto body shop...,"['Automotive Body Repair Services', 'Interior ...",Services,Auto Body Shops,"Automotive Body, Paint, and Interior Repair an...","[Interior Design Services, Painting Services, ...",1.0
3,Stanica WODNA PTTK Swornegacie is a cultural e...,"['Cultural Activities', 'Accommodation Service...",Services,Boat Tours & Cruises,"Scenic and Sightseeing Transportation, Water","[Travel Services, Catering Services, Commercia...",1.0
4,BIQ Benefícios is a Brazilian company that spe...,"['Healthy Food Options', 'Accredited Establish...",Services,Payment Processing & Point of Sale,"Financial Transactions Processing, Reserve, an...","[Food Processing Services, Health Promotion Se...",1.0


In [282]:
combined_df.to_csv('all_companies_tagged.csv', index=False)